diff --git a/configure.ac b/configure.ac
index a46ac7747..4aba828b3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2607,6 +2607,22 @@ then
             ENABLED_ARMASM_NEON=no
             AC_MSG_NOTICE([32bit ARMv7-m found])
             ;;
+        armv6*)
+            AM_CPPFLAGS="$AM_CPPFLAGS -march=armv6 -fomit-frame-pointer -DWOLFSSL_ARMASM_NO_HW_CRYPTO -DWOLFSSL_ARM_ARCH=6"
+            AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN"
+            ENABLED_ARMASM_CRYPTO=no
+            ENABLED_AESGCM_STREAM=no # not yet implemented
+            ENABLED_ARMASM_NEON=no
+            AC_MSG_NOTICE([32bit ARMv6 found])
+            ;;
+        armv4*)
+            AM_CPPFLAGS="$AM_CPPFLAGS -march=armv4 -fomit-frame-pointer -DWOLFSSL_ARMASM_NO_HW_CRYPTO -DWOLFSSL_ARM_ARCH=4"
+            AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN"
+            ENABLED_ARMASM_CRYPTO=no
+            ENABLED_AESGCM_STREAM=no # not yet implemented
+            ENABLED_ARMASM_NEON=no
+            AC_MSG_NOTICE([32bit ARMv4 found])
+            ;;
         *)
             AM_CPPFLAGS="$AM_CPPFLAGS -mfpu=crypto-neon-fp-armv8 -marm"
             # Include options.h
@@ -7500,21 +7516,47 @@ if test "$ENABLED_SP_ASM" = "yes" && test "$ENABLED_SP" = "yes"; then
             AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM64_ASM"
             ENABLED_SP_ARM64_ASM=yes
             ;;
+        *armv7a*)
+            if test "$ENABLED_ARMASM" = "no"; then
+                AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-a -mfpu=neon -DWOLFSSL_ARM_ARCH=7 -marm"
+            fi
+            AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM32_ASM"
+            AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM32_ASM"
+            ENABLED_SP_ARM32_ASM=yes
+            ;;
+        *cortex* | *armv7m*)
+            if test "$ENABLED_ARMASM" = "no"; then
+                AM_CPPFLAGS="$AM_CPPFLAGS -march=armv7-r -D__thumb__ -DWOLFSSL_ARM_ARCH=7"
+            fi
+            AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_CORTEX_M_ASM"
+            AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM_CORTEX_M_ASM"
+            ENABLED_SP_ARM_CORTEX_ASM=yes
+            ;;
+        *armv6*)
+            if test "$ENABLED_ARMASM" = "no"; then
+                AM_CPPFLAGS="$AM_CPPFLAGS -march=armv6 -DWOLFSSL_ARM_ARCH=6"
+            fi
+            AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM32_ASM"
+            AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM32_ASM"
+            ENABLED_SP_ARM32_ASM=yes
+            ;;
+        *armv4*)
+            if test "$ENABLED_ARMASM" = "no"; then
+                AM_CPPFLAGS="$AM_CPPFLAGS -march=armv4 -DWOLFSSL_ARM_ARCH=4"
+            fi
+            AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM32_ASM"
+            AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM32_ASM"
+            ENABLED_SP_ARM32_ASM=yes
+            ;;
         *arm*)
             if test "$host_alias" = "thumb" || test "$ARM_TARGET" = "thumb"; then
                 AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_THUMB_ASM"
                 AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM_THUMB_ASM"
                 ENABLED_SP_ARM_THUMB_ASM=yes
             else
-                if test "$host_alias" = "cortex" || test "$ARM_TARGET" = "cortex"; then
-                    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_CORTEX_M_ASM"
-                    AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM_CORTEX_M_ASM"
-                    ENABLED_SP_ARM_CORTEX_ASM=yes
-                else
-                    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM32_ASM"
-                    AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM32_ASM"
-                    ENABLED_SP_ARM32_ASM=yes
-                fi
+                AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM32_ASM"
+                AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM32_ASM"
+                ENABLED_SP_ARM32_ASM=yes
             fi
             ;;
         *x86_64* | *amd64*)
diff --git a/src/include.am b/src/include.am
index 88cfdb02e..c222d5c67 100644
--- a/src/include.am
+++ b/src/include.am
@@ -171,8 +171,10 @@ endif !BUILD_ARMASM_CRYPTO
 else
 if BUILD_ARMASM
 if BUILD_ARMASM_INLINE
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-aes-asm_c.c
 else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm.S
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-aes-asm.S
 endif !BUILD_ARMASM_INLINE
 endif BUILD_ARMASM
@@ -203,8 +205,10 @@ else
 if BUILD_ARMASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c
 if BUILD_ARMASM_INLINE
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
 else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm.S
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha256-asm.S
 endif !BUILD_ARMASM_INLINE
 else
@@ -229,8 +233,10 @@ else
 if BUILD_ARMASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c
 if BUILD_ARMASM_INLINE
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c
 else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha512-asm.S
 endif !BUILD_ARMASM_INLINE
 else
@@ -326,8 +332,10 @@ else
 if BUILD_ARMASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c
 if BUILD_ARMASM_INLINE
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c
 else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha256-asm.S
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha256-asm.S
 endif !BUILD_ARMASM_INLINE
 else
@@ -427,8 +435,10 @@ endif !BUILD_ARMASM_CRYPTO
 else
 if BUILD_ARMASM
 if BUILD_ARMASM_INLINE
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-aes-asm_c.c
 else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-aes-asm.S
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-aes-asm.S
 endif !BUILD_ARMASM_INLINE
 endif BUILD_ARMASM
@@ -472,8 +482,10 @@ else
 if BUILD_ARMASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha512.c
 if BUILD_ARMASM_INLINE
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c
 else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-sha512-asm.S
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-sha512-asm.S
 endif !BUILD_ARMASM_INLINE
 else
@@ -713,9 +725,11 @@ src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.
endif !BUILD_ARMASM_INLINE else if BUILD_ARMASM_INLINE +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-curve25519_c.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-curve25519_c.c src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519_c.c else +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-32-curve25519.S src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-curve25519.S src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-curve25519.S endif !BUILD_ARMASM_INLINE diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S index cb238f8e5..12578411f 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S @@ -601,9 +601,36 @@ L_AES_invert_key_loop: sub r11, r1, #1 L_AES_invert_key_mix_loop: ldm r0, {r2, r3, r4, r5} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r2, #24 + lsr r6, r6, #24 +#else + uxtb r6, r2 +#endif +#else ubfx r6, r2, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r2, #16 + lsr r7, r7, #24 +#else + uxtb r7, r2, ror #8 +#endif +#else ubfx r7, r2, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r2, #8 + lsr r8, r8, #24 +#else + uxtb r8, r2, ror #16 +#endif +#else ubfx r8, r2, #16, #8 +#endif lsr r9, r2, #24 ldrb r6, [r12, r6, lsl #2] ldrb r7, [r12, r7, lsl #2] @@ -617,9 +644,36 @@ L_AES_invert_key_mix_loop: eor r8, r8, r7, ror #8 eor r8, r8, r9, ror #24 str r8, [r0], #4 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r3, #24 + lsr r6, r6, #24 +#else + uxtb r6, r3 +#endif +#else ubfx r6, r3, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r3, #16 + lsr r7, r7, #24 +#else + uxtb r7, r3, ror #8 +#endif +#else ubfx r7, r3, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r3, #8 + lsr r8, r8, #24 +#else + uxtb r8, r3, ror #16 +#endif +#else ubfx r8, r3, #16, #8 +#endif lsr r9, r3, #24 ldrb r6, [r12, r6, lsl #2] ldrb r7, [r12, r7, lsl #2] @@ -633,9 +687,36 @@ L_AES_invert_key_mix_loop: eor r8, r8, r7, ror #8 eor r8, r8, r9, ror #24 str r8, [r0], #4 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r4, #24 + lsr r6, r6, #24 +#else + uxtb r6, r4 +#endif +#else ubfx r6, r4, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r4, #16 + lsr r7, r7, #24 +#else + uxtb r7, r4, ror #8 +#endif +#else ubfx r7, r4, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r4, #8 + lsr r8, r8, #24 +#else + uxtb r8, r4, ror #16 +#endif +#else ubfx r8, r4, #16, #8 +#endif lsr r9, r4, #24 ldrb r6, [r12, r6, lsl #2] ldrb r7, [r12, r7, lsl #2] @@ -649,9 +730,36 @@ L_AES_invert_key_mix_loop: eor r8, r8, r7, ror #8 eor r8, r8, r9, ror #24 str r8, [r0], #4 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 6) + lsl r6, r5, #24 + lsr r6, r6, #24 +#else + uxtb r6, r5 +#endif +#else ubfx r6, r5, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r5, #16 + lsr r7, r7, #24 +#else + uxtb r7, r5, ror #8 +#endif +#else ubfx r7, r5, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r5, #8 + lsr r8, r8, #24 +#else + uxtb r8, r5, ror #16 +#endif +#else ubfx r8, r5, #16, #8 +#endif lsr r9, r5, #24 ldrb r6, [r12, r6, lsl #2] ldrb r7, [r12, r7, lsl #2] @@ -698,46 +806,119 @@ AES_set_encrypt_key: beq L_AES_set_encrypt_key_start_128 cmp r1, #0xc0 beq L_AES_set_encrypt_key_start_192 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + # REV r4, r4 + eor r3, r4, r4, ror #16 + bic r3, r3, #0xff0000 + ror r4, r4, #8 + eor r4, r4, r3, lsr #8 + # REV r5, r5 + eor r3, r5, r5, ror #16 + bic r3, r3, #0xff0000 + ror r5, r5, #8 + eor r5, r5, r3, lsr #8 + # REV r6, r6 + eor r3, r6, r6, ror #16 + bic r3, r3, #0xff0000 + ror r6, r6, #8 + eor r6, r6, r3, lsr #8 + # REV r7, r7 + eor r3, r7, r7, ror #16 + bic r3, r3, #0xff0000 + ror r7, r7, #8 + eor r7, r7, r3, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ stm r2!, {r4, r5, r6, r7} -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + # REV r4, r4 + eor r3, r4, r4, ror #16 + bic r3, r3, #0xff0000 + ror r4, r4, #8 + eor r4, r4, r3, lsr #8 + # REV r5, r5 + eor r3, r5, r5, ror #16 + bic r3, r3, #0xff0000 + ror r5, r5, #8 + eor r5, r5, r3, lsr #8 + # REV r6, r6 + eor r3, r6, r6, ror #16 + bic r3, r3, #0xff0000 + ror r6, r6, #8 + eor r6, r6, r3, lsr #8 + # REV r7, r7 + eor r3, r7, r7, ror #16 + bic r3, r3, #0xff0000 + ror r7, r7, #8 + eor r7, r7, r3, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ stm r2, {r4, r5, r6, r7} sub r2, r2, #16 mov r12, #6 L_AES_set_encrypt_key_loop_256: +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r7, #24 + lsr r4, r4, #24 +#else + uxtb r4, r7 +#endif +#else ubfx r4, r7, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r7, #16 + lsr r5, r5, #24 +#else + uxtb r5, r7, ror #8 +#endif +#else ubfx r5, r7, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r7, #8 + lsr r6, r6, #24 +#else + uxtb r6, r7, ror #16 +#endif +#else ubfx r6, r7, #16, #8 +#endif lsr r7, r7, #24 ldrb r4, [r8, r4, lsl #2] ldrb r5, [r8, r5, lsl #2] @@ -757,10 
+938,37 @@ L_AES_set_encrypt_key_loop_256: stm r2, {r4, r5, r6, r7} sub r2, r2, #16 mov r3, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r3, #16 + lsr r4, r4, #24 +#else + uxtb r4, r3, ror #8 +#endif +#else ubfx r4, r3, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r3, #8 + lsr r5, r5, #24 +#else + uxtb r5, r3, ror #16 +#endif +#else ubfx r5, r3, #16, #8 +#endif lsr r6, r3, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r3, r3, #24 + lsr r3, r3, #24 +#else + uxtb r3, r3 +#endif +#else ubfx r3, r3, #0, #8 +#endif ldrb r4, [r8, r4, lsl #2] ldrb r6, [r8, r6, lsl #2] ldrb r5, [r8, r5, lsl #2] @@ -778,9 +986,36 @@ L_AES_set_encrypt_key_loop_256: sub r2, r2, #16 subs r12, r12, #1 bne L_AES_set_encrypt_key_loop_256 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r7, #24 + lsr r4, r4, #24 +#else + uxtb r4, r7 +#endif +#else ubfx r4, r7, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r7, #16 + lsr r5, r5, #24 +#else + uxtb r5, r7, ror #8 +#endif +#else ubfx r5, r7, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r7, #8 + lsr r6, r6, #24 +#else + uxtb r6, r7, ror #16 +#endif +#else ubfx r6, r7, #16, #8 +#endif lsr r7, r7, #24 ldrb r4, [r8, r4, lsl #2] ldrb r5, [r8, r5, lsl #2] @@ -801,32 +1036,65 @@ L_AES_set_encrypt_key_loop_256: sub r2, r2, #16 b L_AES_set_encrypt_key_end L_AES_set_encrypt_key_start_192: -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r1, [r0, #20] ldr r0, [r0, #16] #else ldrd r0, r1, [r0, #16] #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + # REV r4, r4 + eor r3, r4, r4, ror #16 + bic r3, r3, #0xff0000 + ror r4, r4, #8 + eor r4, r4, r3, lsr #8 + # REV r5, r5 + eor r3, r5, r5, ror #16 + bic r3, r3, #0xff0000 + ror r5, r5, #8 + eor r5, r5, r3, lsr #8 + # REV r6, r6 + eor r3, r6, r6, ror #16 + bic r3, r3, #0xff0000 + ror r6, r6, #8 + eor r6, r6, r3, lsr #8 + # REV r7, r7 + eor r3, r7, r7, ror #16 + bic r3, r3, #0xff0000 + ror r7, r7, #8 + eor r7, r7, r3, lsr #8 + # REV r0, r0 + eor r3, r0, r0, ror #16 + bic r3, r3, #0xff0000 + ror r0, r0, #8 + eor r0, r0, r3, lsr #8 + # REV r1, r1 + eor r3, r1, r1, ror #16 + bic r3, r3, #0xff0000 + ror r1, r1, #8 + eor r1, r1, r3, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 rev r0, r0 rev r1, r1 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ stm r2, {r4, r5, r6, r7} -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r0, [r2, #16] str r1, [r2, #20] #else @@ -835,9 +1103,36 @@ L_AES_set_encrypt_key_start_192: mov r7, r1 mov r12, #7 L_AES_set_encrypt_key_loop_192: +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r0, r7, #24 + lsr r0, r0, #24 +#else + uxtb r0, r7 +#endif +#else ubfx r0, r7, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r1, r7, #16 + lsr r1, r1, #24 +#else + uxtb r1, r7, ror #8 +#endif +#else ubfx r1, r7, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r7, #8 + lsr r4, r4, #24 +#else + uxtb r4, r7, ror #16 +#endif +#else ubfx r4, r7, #16, #8 +#endif lsr r7, r7, #24 ldrb r0, [r8, r0, lsl #2] ldrb r1, [r8, r1, lsl #2] @@ -858,9 +1153,36 @@ L_AES_set_encrypt_key_loop_192: stm r2, {r0, r1, r4, r5, r6, r7} subs r12, r12, #1 bne L_AES_set_encrypt_key_loop_192 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r0, r7, #24 + lsr r0, r0, #24 +#else + uxtb r0, r7 +#endif +#else ubfx r0, r7, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r1, r7, #16 + lsr r1, r1, #24 +#else + uxtb r1, r7, ror #8 +#endif +#else ubfx r1, r7, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r7, #8 + lsr r4, r4, #24 +#else + uxtb r4, r7, ror #16 +#endif +#else ubfx r4, r7, #16, #8 +#endif lsr r7, r7, #24 ldrb r0, [r8, r0, lsl #2] ldrb r1, [r8, r1, lsl #2] @@ -879,28 +1201,78 @@ L_AES_set_encrypt_key_loop_192: stm r2, {r0, r1, r4, r5} b L_AES_set_encrypt_key_end L_AES_set_encrypt_key_start_128: -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + # REV r4, r4 + eor r3, r4, r4, ror #16 + bic r3, r3, #0xff0000 + ror r4, r4, #8 + eor r4, r4, r3, lsr #8 + # REV r5, r5 + eor r3, r5, r5, ror #16 + bic r3, r3, #0xff0000 + ror r5, r5, #8 + eor r5, r5, r3, lsr #8 + # REV r6, r6 + eor r3, r6, r6, ror #16 + bic r3, r3, #0xff0000 + ror r6, r6, #8 + eor r6, r6, r3, lsr #8 + # REV r7, r7 + eor r3, r7, r7, ror #16 + bic r3, r3, #0xff0000 + ror r7, r7, #8 + eor r7, r7, r3, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ stm r2, {r4, r5, r6, r7} mov r12, #10 L_AES_set_encrypt_key_loop_128: +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r7, #24 + lsr r4, r4, #24 +#else + uxtb r4, r7 +#endif +#else ubfx r4, r7, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r7, #16 + lsr r5, r5, #24 +#else + uxtb r5, r7, ror #8 +#endif +#else ubfx r5, r7, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r7, #8 + lsr r6, r6, #24 +#else + uxtb r6, r7, ror #16 +#endif +#else ubfx r6, r7, #16, #8 +#endif lsr r7, r7, #24 ldrb r4, [r8, r4, lsl #2] ldrb r5, [r8, r5, lsl #2] @@ -929,43 +1301,151 @@ L_AES_set_encrypt_key_end: AES_encrypt_block: push {lr} 
L_AES_encrypt_block_nr: +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r5, #8 + lsr r8, r8, #24 +#else + uxtb r8, r5, ror #16 +#endif +#else ubfx r8, r5, #16, #8 +#endif lsr r11, r4, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r6, #16 + lsr lr, lr, #24 +#else + uxtb lr, r6, ror #8 +#endif +#else ubfx lr, r6, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r7, #24 + lsr r2, r2, #24 +#else + uxtb r2, r7 +#endif +#else ubfx r2, r7, #0, #8 +#endif ldr r8, [r0, r8, lsl #2] ldr r11, [r0, r11, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r9, r6, #8 + lsr r9, r9, #24 +#else + uxtb r9, r6, ror #16 +#endif +#else ubfx r9, r6, #16, #8 +#endif eor r8, r8, r11, ror #24 lsr r11, r5, #24 eor r8, r8, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r7, #16 + lsr lr, lr, #24 +#else + uxtb lr, r7, ror #8 +#endif +#else ubfx lr, r7, #8, #8 +#endif eor r8, r8, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r4, #24 + lsr r2, r2, #24 +#else + uxtb r2, r4 +#endif +#else ubfx r2, r4, #0, #8 +#endif ldr r9, [r0, r9, lsl #2] ldr r11, [r0, r11, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r7, #8 + lsr r10, r10, #24 +#else + uxtb r10, r7, ror #16 +#endif +#else ubfx r10, r7, #16, #8 +#endif eor r9, r9, r11, ror #24 lsr r11, r6, #24 eor r9, r9, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r4, #16 + lsr lr, lr, #24 +#else + uxtb lr, r4, ror #8 +#endif +#else ubfx lr, r4, #8, #8 +#endif eor r9, r9, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #24 + lsr r2, r2, #24 +#else + uxtb r2, r5 +#endif +#else ubfx r2, r5, #0, #8 +#endif ldr r10, [r0, r10, lsl #2] ldr r11, [r0, r11, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r6, #24 + lsr r6, r6, #24 +#else + uxtb r6, r6 +#endif +#else ubfx r6, r6, #0, #8 +#endif eor r10, r10, r11, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r11, r4, #8 + lsr r11, r11, #24 +#else + uxtb r11, r4, ror #16 +#endif +#else ubfx r11, r4, #16, #8 +#endif eor r10, r10, lr, ror #8 lsr lr, r7, #24 eor r10, r10, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #16 + lsr r2, r2, #24 +#else + uxtb r2, r5, ror #8 +#endif +#else ubfx r2, r5, #8, #8 +#endif ldr r6, [r0, r6, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r11, [r0, r11, lsl #2] @@ -979,43 +1459,151 @@ L_AES_encrypt_block_nr: eor r9, r9, r5 eor r10, r10, r6 eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) 
+ lsl r4, r9, #8 + lsr r4, r4, #24 +#else + uxtb r4, r9, ror #16 +#endif +#else ubfx r4, r9, #16, #8 +#endif lsr r7, r8, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r10, #16 + lsr lr, lr, #24 +#else + uxtb lr, r10, ror #8 +#endif +#else ubfx lr, r10, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r11, #24 + lsr r2, r2, #24 +#else + uxtb r2, r11 +#endif +#else ubfx r2, r11, #0, #8 +#endif ldr r4, [r0, r4, lsl #2] ldr r7, [r0, r7, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r10, #8 + lsr r5, r5, #24 +#else + uxtb r5, r10, ror #16 +#endif +#else ubfx r5, r10, #16, #8 +#endif eor r4, r4, r7, ror #24 lsr r7, r9, #24 eor r4, r4, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r11, #16 + lsr lr, lr, #24 +#else + uxtb lr, r11, ror #8 +#endif +#else ubfx lr, r11, #8, #8 +#endif eor r4, r4, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r8, #24 + lsr r2, r2, #24 +#else + uxtb r2, r8 +#endif +#else ubfx r2, r8, #0, #8 +#endif ldr r5, [r0, r5, lsl #2] ldr r7, [r0, r7, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r11, #8 + lsr r6, r6, #24 +#else + uxtb r6, r11, ror #16 +#endif +#else ubfx r6, r11, #16, #8 +#endif eor r5, r5, r7, ror #24 lsr r7, r10, #24 eor r5, r5, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r8, #16 + lsr lr, lr, #24 +#else + uxtb lr, r8, ror #8 +#endif +#else ubfx lr, r8, #8, #8 +#endif eor r5, r5, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r9, #24 + lsr r2, r2, #24 +#else + uxtb r2, r9 +#endif +#else ubfx r2, r9, #0, #8 +#endif ldr r6, [r0, r6, lsl #2] ldr r7, [r0, r7, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r10, #24 + lsr r10, r10, #24 +#else + uxtb r10, r10 +#endif +#else ubfx r10, r10, #0, #8 +#endif eor r6, r6, r7, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r8, #8 + lsr r7, r7, #24 +#else + uxtb r7, r8, ror #16 +#endif +#else ubfx r7, r8, #16, #8 +#endif eor r6, r6, lr, ror #8 lsr lr, r11, #24 eor r6, r6, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r9, #16 + lsr r2, r2, #24 +#else + uxtb r2, r9, ror #8 +#endif +#else ubfx r2, r9, #8, #8 +#endif ldr r10, [r0, r10, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r7, [r0, r7, lsl #2] @@ -1031,43 +1619,151 @@ L_AES_encrypt_block_nr: eor r7, r7, r11 subs r1, r1, #1 bne L_AES_encrypt_block_nr +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r5, #8 + lsr r8, r8, #24 +#else + uxtb r8, r5, ror #16 +#endif +#else ubfx r8, r5, #16, #8 +#endif lsr r11, r4, #24 +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r6, #16 + lsr lr, lr, #24 +#else + uxtb lr, r6, ror #8 +#endif +#else ubfx lr, r6, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r7, #24 + lsr r2, r2, #24 +#else + uxtb r2, r7 +#endif +#else ubfx r2, r7, #0, #8 +#endif ldr r8, [r0, r8, lsl #2] ldr r11, [r0, r11, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r9, r6, #8 + lsr r9, r9, #24 +#else + uxtb r9, r6, ror #16 +#endif +#else ubfx r9, r6, #16, #8 +#endif eor r8, r8, r11, ror #24 lsr r11, r5, #24 eor r8, r8, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r7, #16 + lsr lr, lr, #24 +#else + uxtb lr, r7, ror #8 +#endif +#else ubfx lr, r7, #8, #8 +#endif eor r8, r8, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r4, #24 + lsr r2, r2, #24 +#else + uxtb r2, r4 +#endif +#else ubfx r2, r4, #0, #8 +#endif ldr r9, [r0, r9, lsl #2] ldr r11, [r0, r11, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r7, #8 + lsr r10, r10, #24 +#else + uxtb r10, r7, ror #16 +#endif +#else ubfx r10, r7, #16, #8 +#endif eor r9, r9, r11, ror #24 lsr r11, r6, #24 eor r9, r9, lr, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r4, #16 + lsr lr, lr, #24 +#else + uxtb lr, r4, ror #8 +#endif +#else ubfx lr, r4, #8, #8 +#endif eor r9, r9, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #24 + lsr r2, r2, #24 +#else + uxtb r2, r5 +#endif +#else ubfx r2, r5, #0, #8 +#endif ldr r10, [r0, r10, lsl #2] ldr r11, [r0, r11, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r6, #24 + lsr r6, r6, #24 +#else + uxtb r6, r6 +#endif +#else ubfx r6, r6, #0, #8 +#endif eor r10, r10, r11, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r11, r4, #8 + lsr r11, r11, #24 +#else + uxtb r11, r4, ror #16 +#endif +#else ubfx r11, r4, #16, #8 +#endif eor r10, r10, lr, ror #8 lsr lr, r7, #24 eor r10, r10, r2, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r5, #16 + lsr r2, r2, #24 +#else + uxtb r2, r5, ror #8 +#endif +#else ubfx r2, r5, #8, #8 +#endif ldr r6, [r0, r6, lsl #2] ldr lr, [r0, lr, lsl #2] ldr r11, [r0, r11, lsl #2] @@ -1081,30 +1777,111 @@ L_AES_encrypt_block_nr: eor r9, r9, r5 eor r10, r10, r6 eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r11, #24 + lsr r4, r4, #24 +#else + uxtb r4, r11 +#endif +#else ubfx r4, r11, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r10, #16 + lsr r7, r7, #24 +#else + uxtb 
r7, r10, ror #8 +#endif +#else ubfx r7, r10, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r9, #8 + lsr lr, lr, #24 +#else + uxtb lr, r9, ror #16 +#endif +#else ubfx lr, r9, #16, #8 +#endif lsr r2, r8, #24 ldrb r4, [r0, r4, lsl #2] ldrb r7, [r0, r7, lsl #2] ldrb lr, [r0, lr, lsl #2] ldrb r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r8, #24 + lsr r5, r5, #24 +#else + uxtb r5, r8 +#endif +#else ubfx r5, r8, #0, #8 +#endif eor r4, r4, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r11, #16 + lsr r7, r7, #24 +#else + uxtb r7, r11, ror #8 +#endif +#else ubfx r7, r11, #8, #8 +#endif eor r4, r4, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r10, #8 + lsr lr, lr, #24 +#else + uxtb lr, r10, ror #16 +#endif +#else ubfx lr, r10, #16, #8 +#endif eor r4, r4, r2, lsl #24 lsr r2, r9, #24 ldrb r5, [r0, r5, lsl #2] ldrb r7, [r0, r7, lsl #2] ldrb lr, [r0, lr, lsl #2] ldrb r2, [r0, r2, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r9, #24 + lsr r6, r6, #24 +#else + uxtb r6, r9 +#endif +#else ubfx r6, r9, #0, #8 +#endif eor r5, r5, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r8, #16 + lsr r7, r7, #24 +#else + uxtb r7, r8, ror #8 +#endif +#else ubfx r7, r8, #8, #8 +#endif eor r5, r5, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r11, #8 + lsr lr, lr, #24 +#else + uxtb lr, r11, ror #16 +#endif +#else ubfx lr, r11, #16, #8 +#endif eor r5, r5, r2, lsl #24 lsr r2, r10, #24 ldrb r6, [r0, r6, lsl #2] @@ -1113,11 +1890,38 @@ L_AES_encrypt_block_nr: ldrb r2, [r0, r2, lsl #2] lsr r11, r11, #24 eor r6, r6, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r10, #24 + lsr r7, r7, #24 +#else + uxtb r7, r10 +#endif +#else ubfx r7, r10, #0, #8 +#endif eor r6, r6, lr, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r9, #16 + lsr lr, lr, #24 +#else + uxtb lr, r9, ror #8 +#endif +#else ubfx lr, r9, #8, #8 +#endif eor r6, r6, r2, lsl #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r8, #8 + lsr r2, r2, #24 +#else + uxtb r2, r8, ror #16 +#endif +#else ubfx r2, r8, #16, #8 +#endif ldrb r11, [r0, r11, lsl #2] ldrb r7, [r0, r7, lsl #2] ldrb lr, [r0, lr, lsl #2] @@ -1133,15 +1937,13 @@ L_AES_encrypt_block_nr: eor r7, r7, r11 pop {pc} .size AES_encrypt_block,.-AES_encrypt_block -#if defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) .text .type L_AES_ARM32_te_ecb, %object .size L_AES_ARM32_te_ecb, 12 .align 4 L_AES_ARM32_te_ecb: .word L_AES_ARM32_te_data -#endif /* HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ -#if 
defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) .text .align 4 .globl AES_ECB_encrypt @@ -1162,10 +1964,29 @@ L_AES_ECB_encrypt_loop_block_256: ldr r5, [lr, #4] ldr r6, [lr, #8] ldr r7, [lr, #12] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ push {r1, r2, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule @@ -1177,10 +1998,29 @@ L_AES_ECB_encrypt_loop_block_256: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] @@ -1196,10 +2036,29 @@ L_AES_ECB_encrypt_loop_block_192: ldr r5, [lr, #4] ldr r6, [lr, #8] ldr r7, [lr, #12] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ push {r1, r2, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule @@ -1211,10 +2070,29 @@ L_AES_ECB_encrypt_loop_block_192: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] @@ -1230,10 +2108,29 @@ L_AES_ECB_encrypt_loop_block_128: ldr r5, [lr, #4] ldr r6, [lr, #8] ldr r7, [lr, #12] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 
rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ push {r1, r2, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule @@ -1245,10 +2142,29 @@ L_AES_ECB_encrypt_loop_block_128: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] @@ -1263,6 +2179,12 @@ L_AES_ECB_encrypt_end: .size AES_ECB_encrypt,.-AES_ECB_encrypt #endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_CBC + .text + .type L_AES_ARM32_te_cbc, %object + .size L_AES_ARM32_te_cbc, 12 + .align 4 +L_AES_ARM32_te_cbc: + .word L_AES_ARM32_te_data .text .align 4 .globl AES_CBC_encrypt @@ -1272,7 +2194,7 @@ AES_CBC_encrypt: ldr r8, [sp, #36] ldr r9, [sp, #40] mov lr, r0 - adr r0, L_AES_ARM32_te_ecb + adr r0, L_AES_ARM32_te_cbc ldr r0, [r0] ldm r9, {r4, r5, r6, r7} push {r3, r9} @@ -1290,11 +2212,30 @@ L_AES_CBC_encrypt_loop_block_256: eor r6, r6, r10 eor r7, r7, r11 push {r1, r2, lr} - ldm r3!, {r8, r9, r10, r11} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 @@ -1304,10 +2245,29 @@ L_AES_CBC_encrypt_loop_block_256: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] @@ -1328,11 +2288,30 @@ L_AES_CBC_encrypt_loop_block_192: eor r6, r6, r10 eor r7, r7, r11 push {r1, r2, lr} - ldm r3!, {r8, r9, r10, r11} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && 
WOLFSSL_ARM_ARCH < 6 */ + ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 @@ -1342,10 +2321,29 @@ L_AES_CBC_encrypt_loop_block_192: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] @@ -1366,11 +2364,30 @@ L_AES_CBC_encrypt_loop_block_128: eor r6, r6, r10 eor r7, r7, r11 push {r1, r2, lr} - ldm r3!, {r8, r9, r10, r11} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 @@ -1380,10 +2397,29 @@ L_AES_CBC_encrypt_loop_block_128: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] @@ -1399,6 +2435,12 @@ L_AES_CBC_encrypt_end: .size AES_CBC_encrypt,.-AES_CBC_encrypt #endif /* HAVE_AES_CBC */ #ifdef WOLFSSL_AES_COUNTER + .text + .type L_AES_ARM32_te_ctr, %object + .size L_AES_ARM32_te_ctr, 12 + .align 4 +L_AES_ARM32_te_ctr: + .word L_AES_ARM32_te_data .text .align 4 .globl AES_CTR_encrypt @@ -1408,13 +2450,32 @@ AES_CTR_encrypt: ldr r12, [sp, #36] ldr r8, [sp, #40] mov lr, r0 - adr r0, L_AES_ARM32_te_ecb + adr r0, L_AES_ARM32_te_ctr ldr r0, [r0] ldm r8, {r4, r5, r6, r7} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r10, r4, r4, ror #16 + eor r11, r5, r5, ror #16 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + eor r4, r4, r10, lsr #8 + eor r5, r5, r11, lsr #8 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r6, r6, #8 + ror r7, r7, #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ stm r8, {r4, r5, r6, r7} push {r3, r8} cmp r12, #10 @@ -1439,10 +2500,29 @@ L_AES_CTR_encrypt_loop_block_256: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, 
r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ ldr r8, [lr] ldr r9, [lr, #4] ldr r10, [lr, #8] @@ -1481,10 +2561,29 @@ L_AES_CTR_encrypt_loop_block_192: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ ldr r8, [lr] ldr r9, [lr, #4] ldr r10, [lr, #8] @@ -1523,10 +2622,29 @@ L_AES_CTR_encrypt_loop_block_128: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ ldr r8, [lr] ldr r9, [lr, #4] ldr r10, [lr, #8] @@ -1547,10 +2665,29 @@ L_AES_CTR_encrypt_loop_block_128: bne L_AES_CTR_encrypt_loop_block_128 L_AES_CTR_encrypt_end: pop {r3, r8} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r10, r4, r4, ror #16 + eor r11, r5, r5, ror #16 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + eor r4, r4, r10, lsr #8 + eor r5, r5, r11, lsr #8 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r6, r6, #8 + ror r7, r7, #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ stm r8, {r4, r5, r6, r7} pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size AES_CTR_encrypt,.-AES_CTR_encrypt @@ -1564,43 +2701,151 @@ L_AES_CTR_encrypt_end: AES_decrypt_block: push {lr} L_AES_decrypt_block_nr: +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r7, #8 + lsr r8, r8, #24 +#else + uxtb r8, r7, ror #16 +#endif +#else ubfx r8, r7, #16, #8 +#endif lsr r11, r4, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r6, #16 + lsr r12, r12, #24 +#else + uxtb r12, r6, ror #8 +#endif +#else ubfx r12, r6, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r5, #24 + lsr lr, lr, #24 +#else + uxtb lr, r5 +#endif +#else ubfx lr, r5, #0, #8 +#endif ldr r8, [r0, r8, lsl #2] ldr r11, [r0, r11, lsl #2] ldr r12, [r0, r12, lsl #2] 
ldr lr, [r0, lr, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r9, r4, #8 + lsr r9, r9, #24 +#else + uxtb r9, r4, ror #16 +#endif +#else ubfx r9, r4, #16, #8 +#endif eor r8, r8, r11, ror #24 lsr r11, r5, #24 eor r8, r8, r12, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r7, #16 + lsr r12, r12, #24 +#else + uxtb r12, r7, ror #8 +#endif +#else ubfx r12, r7, #8, #8 +#endif eor r8, r8, lr, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r6, #24 + lsr lr, lr, #24 +#else + uxtb lr, r6 +#endif +#else ubfx lr, r6, #0, #8 +#endif ldr r9, [r0, r9, lsl #2] ldr r11, [r0, r11, lsl #2] ldr r12, [r0, r12, lsl #2] ldr lr, [r0, lr, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r5, #8 + lsr r10, r10, #24 +#else + uxtb r10, r5, ror #16 +#endif +#else ubfx r10, r5, #16, #8 +#endif eor r9, r9, r11, ror #24 lsr r11, r6, #24 eor r9, r9, r12, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r4, #16 + lsr r12, r12, #24 +#else + uxtb r12, r4, ror #8 +#endif +#else ubfx r12, r4, #8, #8 +#endif eor r9, r9, lr, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r7, #24 + lsr lr, lr, #24 +#else + uxtb lr, r7 +#endif +#else ubfx lr, r7, #0, #8 +#endif ldr r10, [r0, r10, lsl #2] ldr r11, [r0, r11, lsl #2] ldr r12, [r0, r12, lsl #2] ldr lr, [r0, lr, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r4, #24 + lsr r4, r4, #24 +#else + uxtb r4, r4 +#endif +#else ubfx r4, r4, #0, #8 +#endif eor r10, r10, r11, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r11, r6, #8 + lsr r11, r11, #24 +#else + uxtb r11, r6, ror #16 +#endif +#else ubfx r11, r6, #16, #8 +#endif eor r10, r10, r12, ror #8 lsr r12, r7, #24 eor r10, r10, lr, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r5, #16 + lsr lr, lr, #24 +#else + uxtb lr, r5, ror #8 +#endif +#else ubfx lr, r5, #8, #8 +#endif ldr r4, [r0, r4, lsl #2] ldr r12, [r0, r12, lsl #2] ldr r11, [r0, r11, lsl #2] @@ -1614,43 +2859,151 @@ L_AES_decrypt_block_nr: eor r9, r9, r5 eor r10, r10, r6 eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r11, #8 + lsr r4, r4, #24 +#else + uxtb r4, r11, ror #16 +#endif +#else ubfx r4, r11, #16, #8 +#endif lsr r7, r8, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r10, #16 + lsr r12, r12, #24 +#else + uxtb r12, r10, ror #8 +#endif +#else ubfx r12, r10, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r9, #24 + lsr lr, lr, #24 +#else + uxtb lr, r9 +#endif +#else ubfx lr, r9, #0, #8 +#endif ldr r4, [r0, r4, lsl #2] ldr r7, [r0, r7, lsl #2] ldr r12, [r0, r12, lsl #2] ldr lr, [r0, lr, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r8, #8 + lsr r5, r5, #24 +#else + uxtb r5, r8, ror #16 +#endif +#else ubfx r5, r8, #16, #8 +#endif eor r4, r4, r7, ror #24 lsr r7, r9, #24 eor r4, r4, r12, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r11, #16 + lsr r12, r12, #24 +#else + uxtb r12, r11, ror #8 +#endif +#else ubfx r12, r11, #8, #8 +#endif eor r4, r4, lr, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r10, #24 + lsr lr, lr, #24 +#else + uxtb lr, r10 +#endif +#else ubfx lr, r10, #0, #8 +#endif ldr r5, [r0, r5, lsl #2] ldr r7, [r0, r7, lsl #2] ldr r12, [r0, r12, lsl #2] ldr lr, [r0, lr, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r9, #8 + lsr r6, r6, #24 +#else + uxtb r6, r9, ror #16 +#endif +#else ubfx r6, r9, #16, #8 +#endif eor r5, r5, r7, ror #24 lsr r7, r10, #24 eor r5, r5, r12, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r8, #16 + lsr r12, r12, #24 +#else + uxtb r12, r8, ror #8 +#endif +#else ubfx r12, r8, #8, #8 +#endif eor r5, r5, lr, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r11, #24 + lsr lr, lr, #24 +#else + uxtb lr, r11 +#endif +#else ubfx lr, r11, #0, #8 +#endif ldr r6, [r0, r6, lsl #2] ldr r7, [r0, r7, lsl #2] ldr r12, [r0, r12, lsl #2] ldr lr, [r0, lr, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r8, #24 + lsr r8, r8, #24 +#else + uxtb r8, r8 +#endif +#else ubfx r8, r8, #0, #8 +#endif eor r6, r6, r7, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r10, #8 + lsr r7, r7, #24 +#else + uxtb r7, r10, ror #16 +#endif +#else ubfx r7, r10, #16, #8 +#endif eor r6, r6, r12, ror #8 lsr r12, r11, #24 eor r6, r6, lr, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r9, #16 + lsr lr, lr, #24 +#else + uxtb lr, r9, ror #8 +#endif +#else ubfx lr, r9, #8, #8 +#endif ldr r8, [r0, r8, lsl #2] ldr r12, [r0, r12, lsl #2] ldr r7, [r0, r7, lsl #2] @@ -1666,43 +3019,151 @@ L_AES_decrypt_block_nr: eor r7, r7, r11 subs r1, r1, #1 bne L_AES_decrypt_block_nr +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r8, r7, #8 + lsr r8, r8, #24 +#else + uxtb r8, r7, ror #16 +#endif +#else ubfx r8, r7, #16, #8 +#endif lsr r11, r4, #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r6, #16 + lsr r12, r12, #24 +#else + uxtb r12, r6, ror #8 +#endif +#else ubfx r12, r6, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r5, #24 + lsr lr, lr, #24 +#else + uxtb lr, r5 +#endif +#else ubfx lr, r5, #0, #8 +#endif ldr r8, [r0, r8, lsl #2] ldr r11, [r0, r11, lsl #2] ldr r12, [r0, r12, lsl #2] ldr lr, [r0, lr, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r9, r4, #8 + lsr r9, r9, #24 +#else + uxtb r9, r4, ror 
#16 +#endif +#else ubfx r9, r4, #16, #8 +#endif eor r8, r8, r11, ror #24 lsr r11, r5, #24 eor r8, r8, r12, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r7, #16 + lsr r12, r12, #24 +#else + uxtb r12, r7, ror #8 +#endif +#else ubfx r12, r7, #8, #8 +#endif eor r8, r8, lr, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r6, #24 + lsr lr, lr, #24 +#else + uxtb lr, r6 +#endif +#else ubfx lr, r6, #0, #8 +#endif ldr r9, [r0, r9, lsl #2] ldr r11, [r0, r11, lsl #2] ldr r12, [r0, r12, lsl #2] ldr lr, [r0, lr, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r10, r5, #8 + lsr r10, r10, #24 +#else + uxtb r10, r5, ror #16 +#endif +#else ubfx r10, r5, #16, #8 +#endif eor r9, r9, r11, ror #24 lsr r11, r6, #24 eor r9, r9, r12, ror #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r4, #16 + lsr r12, r12, #24 +#else + uxtb r12, r4, ror #8 +#endif +#else ubfx r12, r4, #8, #8 +#endif eor r9, r9, lr, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r7, #24 + lsr lr, lr, #24 +#else + uxtb lr, r7 +#endif +#else ubfx lr, r7, #0, #8 +#endif ldr r10, [r0, r10, lsl #2] ldr r11, [r0, r11, lsl #2] ldr r12, [r0, r12, lsl #2] ldr lr, [r0, lr, lsl #2] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r4, #24 + lsr r4, r4, #24 +#else + uxtb r4, r4 +#endif +#else ubfx r4, r4, #0, #8 +#endif eor r10, r10, r11, ror #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r11, r6, #8 + lsr r11, r11, #24 +#else + uxtb r11, r6, ror #16 +#endif +#else ubfx r11, r6, #16, #8 +#endif eor r10, r10, r12, ror #8 lsr r12, r7, #24 eor r10, r10, lr, ror #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r5, #16 + lsr lr, lr, #24 +#else + uxtb lr, r5, ror #8 +#endif +#else ubfx lr, r5, #8, #8 +#endif ldr r4, [r0, r4, lsl #2] ldr r12, [r0, r12, lsl #2] ldr r11, [r0, r11, lsl #2] @@ -1716,30 +3177,111 @@ L_AES_decrypt_block_nr: eor r9, r9, r5 eor r10, r10, r6 eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r4, r9, #24 + lsr r4, r4, #24 +#else + uxtb r4, r9 +#endif +#else ubfx r4, r9, #0, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r10, #16 + lsr r7, r7, #24 +#else + uxtb r7, r10, ror #8 +#endif +#else ubfx r7, r10, #8, #8 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r11, #8 + lsr r12, r12, #24 +#else + uxtb r12, r11, ror #16 +#endif +#else ubfx r12, r11, #16, #8 +#endif lsr lr, r8, #24 ldrb r4, [r2, r4] ldrb r7, [r2, r7] ldrb r12, [r2, r12] ldrb lr, [r2, lr] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r5, r10, #24 + lsr r5, r5, #24 +#else + uxtb r5, r10 +#endif +#else ubfx r5, r10, #0, #8 +#endif eor r4, r4, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r11, #16 + lsr r7, r7, #24 +#else + uxtb r7, r11, ror #8 +#endif +#else ubfx r7, r11, #8, #8 +#endif eor r4, r4, r12, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r8, #8 + lsr r12, r12, #24 +#else + uxtb r12, r8, ror #16 +#endif +#else ubfx r12, r8, #16, #8 +#endif eor r4, r4, lr, lsl #24 lsr lr, r9, #24 ldrb r7, [r2, r7] ldrb lr, [r2, lr] ldrb r5, [r2, r5] ldrb r12, [r2, r12] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r6, r11, #24 + lsr r6, r6, #24 +#else + uxtb r6, r11 +#endif +#else ubfx r6, r11, #0, #8 +#endif eor r5, r5, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r8, #16 + lsr r7, r7, #24 +#else + uxtb r7, r8, ror #8 +#endif +#else ubfx r7, r8, #8, #8 +#endif eor r5, r5, r12, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r9, #8 + lsr r12, r12, #24 +#else + uxtb r12, r9, ror #16 +#endif +#else ubfx r12, r9, #16, #8 +#endif eor r5, r5, lr, lsl #24 lsr lr, r10, #24 ldrb r7, [r2, r7] @@ -1748,11 +3290,38 @@ L_AES_decrypt_block_nr: ldrb r12, [r2, r12] lsr r11, r11, #24 eor r6, r6, r7, lsl #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r7, r8, #24 + lsr r7, r7, #24 +#else + uxtb r7, r8 +#endif +#else ubfx r7, r8, #0, #8 +#endif eor r6, r6, r12, lsl #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r12, r9, #16 + lsr r12, r12, #24 +#else + uxtb r12, r9, ror #8 +#endif +#else ubfx r12, r9, #8, #8 +#endif eor r6, r6, lr, lsl #24 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl lr, r10, #8 + lsr lr, lr, #24 +#else + uxtb lr, r10, ror #16 +#endif +#else ubfx lr, r10, #16, #8 +#endif ldrb r11, [r2, r11] ldrb r12, [r2, r12] ldrb r7, [r2, r7] @@ -2057,10 +3626,29 @@ L_AES_ECB_decrypt_loop_block_256: ldr r5, [lr, #4] ldr r6, [lr, #8] ldr r7, [lr, #12] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ push {r1, r3, r12, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule @@ -2071,10 +3659,29 @@ L_AES_ECB_decrypt_loop_block_256: mov r1, #6 bl AES_decrypt_block pop {r1, r3, r12, lr} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && 
WOLFSSL_ARM_ARCH < 6 */ str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] @@ -2090,10 +3697,29 @@ L_AES_ECB_decrypt_loop_block_192: ldr r5, [lr, #4] ldr r6, [lr, #8] ldr r7, [lr, #12] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ push {r1, r3, r12, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule @@ -2104,10 +3730,29 @@ L_AES_ECB_decrypt_loop_block_192: mov r1, #5 bl AES_decrypt_block pop {r1, r3, r12, lr} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] @@ -2123,10 +3768,29 @@ L_AES_ECB_decrypt_loop_block_128: ldr r5, [lr, #4] ldr r6, [lr, #8] ldr r7, [lr, #12] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ push {r1, r3, r12, lr} ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule @@ -2137,10 +3801,29 @@ L_AES_ECB_decrypt_loop_block_128: mov r1, #4 bl AES_decrypt_block pop {r1, r3, r12, lr} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ str r4, [r1] str r5, [r1, #4] str r6, [r1, #8] @@ -2179,23 +3862,42 @@ L_AES_CBC_decrypt_loop_block_256: ldr r6, [lr, #8] ldr r7, [lr, #12] ldr lr, [sp, #16] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [lr, #16] str r5, [lr, #20] #else strd r4, r5, [lr, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [lr, #24] str r7, [lr, #28] #else strd r6, r7, [lr, #24] #endif - ldm r3!, {r8, r9, r10, r11} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, 
r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 @@ -2204,10 +3906,29 @@ L_AES_CBC_decrypt_loop_block_256: mov r1, #6 bl AES_decrypt_block ldr lr, [sp, #16] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ ldm lr, {r8, r9, r10, r11} pop {r1, r12, lr} ldr r3, [sp] @@ -2229,23 +3950,42 @@ L_AES_CBC_decrypt_loop_block_256: ldr r6, [lr, #8] ldr r7, [lr, #12] ldr lr, [sp, #16] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [lr] str r5, [lr, #4] #else strd r4, r5, [lr] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [lr, #8] str r7, [lr, #12] #else strd r6, r7, [lr, #8] #endif - ldm r3!, {r8, r9, r10, r11} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 @@ -2254,17 +3994,36 @@ L_AES_CBC_decrypt_loop_block_256: mov r1, #6 bl AES_decrypt_block ldr lr, [sp, #16] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [lr, #16] ldr r9, [lr, #20] #else ldrd r8, r9, [lr, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [lr, #24] ldr r11, [lr, #28] #else @@ -2292,23 +4051,42 @@ L_AES_CBC_decrypt_loop_block_192: ldr r6, [lr, #8] ldr r7, [lr, #12] ldr lr, [sp, #16] -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [lr, #16] str r5, [lr, #20] #else strd r4, r5, [lr, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [lr, #24] str r7, [lr, #28] #else strd r6, r7, [lr, #24] #endif - ldm r3!, {r8, r9, r10, r11} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 @@ -2317,10 +4095,29 @@ L_AES_CBC_decrypt_loop_block_192: mov r1, #5 bl AES_decrypt_block ldr lr, [sp, #16] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ ldm lr, {r8, r9, r10, r11} pop {r1, r12, lr} ldr r3, [sp] @@ -2342,23 +4139,42 @@ L_AES_CBC_decrypt_loop_block_192: ldr r6, [lr, #8] ldr r7, [lr, #12] ldr lr, [sp, #16] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [lr] str r5, [lr, #4] #else strd r4, r5, [lr] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [lr, #8] str r7, [lr, #12] #else strd r6, r7, [lr, #8] #endif - ldm r3!, {r8, r9, r10, r11} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 @@ -2367,17 +4183,36 @@ L_AES_CBC_decrypt_loop_block_192: mov r1, #5 bl AES_decrypt_block ldr lr, [sp, #16] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif /* WOLFSSL_ARM_ARCH && 
WOLFSSL_ARM_ARCH < 6 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [lr, #16] ldr r9, [lr, #20] #else ldrd r8, r9, [lr, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [lr, #24] ldr r11, [lr, #28] #else @@ -2405,23 +4240,42 @@ L_AES_CBC_decrypt_loop_block_128: ldr r6, [lr, #8] ldr r7, [lr, #12] ldr lr, [sp, #16] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [lr, #16] str r5, [lr, #20] #else strd r4, r5, [lr, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [lr, #24] str r7, [lr, #28] #else strd r6, r7, [lr, #24] #endif - ldm r3!, {r8, r9, r10, r11} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 @@ -2430,10 +4284,29 @@ L_AES_CBC_decrypt_loop_block_128: mov r1, #4 bl AES_decrypt_block ldr lr, [sp, #16] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ ldm lr, {r8, r9, r10, r11} pop {r1, r12, lr} ldr r3, [sp] @@ -2455,23 +4328,42 @@ L_AES_CBC_decrypt_loop_block_128: ldr r6, [lr, #8] ldr r7, [lr, #12] ldr lr, [sp, #16] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [lr] str r5, [lr, #4] #else strd r4, r5, [lr] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [lr, #8] str r7, [lr, #12] #else strd r6, r7, [lr, #8] #endif - ldm r3!, {r8, r9, r10, r11} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + ldm r3!, {r8, r9, r10, r11} # Round: 0 - XOR in key schedule eor r4, r4, r8 eor r5, r5, r9 @@ -2480,17 +4372,36 @@ L_AES_CBC_decrypt_loop_block_128: mov r1, #4 bl AES_decrypt_block ldr lr, [sp, #16] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, 
r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [lr, #16] ldr r9, [lr, #20] #else ldrd r8, r9, [lr, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [lr, #24] ldr r11, [lr, #28] #else @@ -2513,25 +4424,25 @@ L_AES_CBC_decrypt_loop_block_128: b L_AES_CBC_decrypt_end L_AES_CBC_decrypt_end_odd: ldr r4, [sp, #4] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r4, #16] ldr r9, [r4, #20] #else ldrd r8, r9, [r4, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r4, #24] ldr r11, [r4, #28] #else ldrd r10, r11, [r4, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r4] str r9, [r4, #4] #else strd r8, r9, [r4] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r10, [r4, #8] str r11, [r4, #12] #else @@ -3110,10 +5021,33 @@ L_GCM_gmult_len_start_block: eor r9, r9, r5 eor r10, r10, r6 eor r11, r11, r7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + # REV r8, r8 + eor r3, r8, r8, ror #16 + bic r3, r3, #0xff0000 + ror r8, r8, #8 + eor r8, r8, r3, lsr #8 + # REV r9, r9 + eor r3, r9, r9, ror #16 + bic r3, r3, #0xff0000 + ror r9, r9, #8 + eor r9, r9, r3, lsr #8 + # REV r10, r10 + eor r3, r10, r10, ror #16 + bic r3, r3, #0xff0000 + ror r10, r10, #8 + eor r10, r10, r3, lsr #8 + # REV r11, r11 + eor r3, r11, r11, ror #16 + bic r3, r3, #0xff0000 + ror r11, r11, #8 + eor r11, r11, r3, lsr #8 +#else rev r8, r8 rev r9, r9 rev r10, r10 rev r11, r11 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ stm r0, {r8, r9, r10, r11} pop {r3} subs r3, r3, #16 @@ -3139,10 +5073,29 @@ AES_GCM_encrypt: adr r0, L_AES_ARM32_te_gcm ldr r0, [r0] ldm r8, {r4, r5, r6, r7} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r10, r4, r4, ror #16 + eor r11, r5, r5, ror #16 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + eor r4, r4, r10, lsr #8 + eor r5, r5, r11, lsr #8 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r6, r6, #8 + ror r7, r7, #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ stm r8, {r4, r5, r6, r7} push {r3, r8} cmp r12, #10 @@ -3164,10 +5117,29 @@ L_AES_GCM_encrypt_loop_block_256: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, 
r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ ldr r8, [lr] ldr r9, [lr, #4] ldr r10, [lr, #8] @@ -3203,10 +5175,29 @@ L_AES_GCM_encrypt_loop_block_192: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ ldr r8, [lr] ldr r9, [lr, #4] ldr r10, [lr, #8] @@ -3242,10 +5233,29 @@ L_AES_GCM_encrypt_loop_block_128: bl AES_encrypt_block pop {r1, r2, lr} ldr r3, [sp] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ ldr r8, [lr] ldr r9, [lr, #4] ldr r10, [lr, #8] @@ -3266,10 +5276,29 @@ L_AES_GCM_encrypt_loop_block_128: bne L_AES_GCM_encrypt_loop_block_128 L_AES_GCM_encrypt_end: pop {r3, r8} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + eor r10, r4, r4, ror #16 + eor r11, r5, r5, ror #16 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + eor r4, r4, r10, lsr #8 + eor r5, r5, r11, lsr #8 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r6, r6, #8 + ror r7, r7, #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#else rev r4, r4 rev r5, r5 rev r6, r6 rev r7, r7 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ stm r8, {r4, r5, r6, r7} pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size AES_GCM_encrypt,.-AES_GCM_encrypt diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c index a2da62598..b6d07ed9d 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c @@ -39,6 +39,18 @@ #include #include #ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ #ifndef NO_AES #include @@ -215,9 +227,36 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p) "\n" "L_AES_invert_key_mix_loop_%=: \n\t" "ldm %[ks], {r2, r3, r4, r5}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r2, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r2\n\t" +#endif +#else "ubfx r6, r2, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r2, 
#16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r2, ror #8\n\t" +#endif +#else "ubfx r7, r2, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r2, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r2, ror #16\n\t" +#endif +#else "ubfx r8, r2, #16, #8\n\t" +#endif "lsr r9, r2, #24\n\t" "ldrb r6, [r12, r6, lsl #2]\n\t" "ldrb r7, [r12, r7, lsl #2]\n\t" @@ -231,9 +270,36 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p) "eor r8, r8, r7, ror #8\n\t" "eor r8, r8, r9, ror #24\n\t" "str r8, [%[ks]], #4\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r3, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r3\n\t" +#endif +#else "ubfx r6, r3, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r3, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r3, ror #8\n\t" +#endif +#else "ubfx r7, r3, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r3, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r3, ror #16\n\t" +#endif +#else "ubfx r8, r3, #16, #8\n\t" +#endif "lsr r9, r3, #24\n\t" "ldrb r6, [r12, r6, lsl #2]\n\t" "ldrb r7, [r12, r7, lsl #2]\n\t" @@ -247,9 +313,36 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p) "eor r8, r8, r7, ror #8\n\t" "eor r8, r8, r9, ror #24\n\t" "str r8, [%[ks]], #4\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r4, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r4\n\t" +#endif +#else "ubfx r6, r4, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r4, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r4, ror #8\n\t" +#endif +#else "ubfx r7, r4, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r4, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r4, ror #16\n\t" +#endif +#else "ubfx r8, r4, #16, #8\n\t" +#endif "lsr r9, r4, #24\n\t" "ldrb r6, [r12, r6, lsl #2]\n\t" "ldrb r7, [r12, r7, lsl #2]\n\t" @@ -263,9 +356,36 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p) "eor r8, r8, r7, ror #8\n\t" "eor r8, r8, r9, ror #24\n\t" "str r8, [%[ks]], #4\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r5, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r5\n\t" +#endif +#else "ubfx r6, r5, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r5, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r5, ror #8\n\t" +#endif +#else "ubfx r7, r5, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r5, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r5, ror #16\n\t" +#endif +#else "ubfx r8, r5, #16, #8\n\t" +#endif "lsr r9, r5, #24\n\t" "ldrb r6, [r12, r6, lsl #2]\n\t" "ldrb r7, [r12, r7, lsl #2]\n\t" @@ -310,47 +430,120 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "beq L_AES_set_encrypt_key_start_128_%=\n\t" "cmp 
%[len], #0xc0\n\t" "beq L_AES_set_encrypt_key_start_192_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[key]]\n\t" "ldr r5, [%[key], #4]\n\t" #else "ldrd r4, r5, [%[key]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[key], #8]\n\t" "ldr r7, [%[key], #12]\n\t" #else "ldrd r6, r7, [%[key], #8]\n\t" #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + /* REV r4, r4 */ + "eor r3, r4, r4, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "eor r4, r4, r3, lsr #8\n\t" + /* REV r5, r5 */ + "eor r3, r5, r5, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r5, r5, #8\n\t" + "eor r5, r5, r3, lsr #8\n\t" + /* REV r6, r6 */ + "eor r3, r6, r6, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r6, r6, #8\n\t" + "eor r6, r6, r3, lsr #8\n\t" + /* REV r7, r7 */ + "eor r3, r7, r7, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r7, r7, #8\n\t" + "eor r7, r7, r3, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm %[ks]!, {r4, r5, r6, r7}\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[key], #16]\n\t" "ldr r5, [%[key], #20]\n\t" #else "ldrd r4, r5, [%[key], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[key], #24]\n\t" "ldr r7, [%[key], #28]\n\t" #else "ldrd r6, r7, [%[key], #24]\n\t" #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + /* REV r4, r4 */ + "eor r3, r4, r4, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "eor r4, r4, r3, lsr #8\n\t" + /* REV r5, r5 */ + "eor r3, r5, r5, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r5, r5, #8\n\t" + "eor r5, r5, r3, lsr #8\n\t" + /* REV r6, r6 */ + "eor r3, r6, r6, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r6, r6, #8\n\t" + "eor r6, r6, r3, lsr #8\n\t" + /* REV r7, r7 */ + "eor r3, r7, r7, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r7, r7, #8\n\t" + "eor r7, r7, r3, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm %[ks], {r4, r5, r6, r7}\n\t" "sub %[ks], %[ks], #16\n\t" "mov r12, #6\n\t" "\n" "L_AES_set_encrypt_key_loop_256_%=: \n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r7, #24\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r7\n\t" +#endif +#else "ubfx r4, r7, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r7, #16\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r7, ror #8\n\t" +#endif +#else "ubfx r5, r7, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r7, #8\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r7, ror #16\n\t" +#endif +#else "ubfx r6, r7, #16, #8\n\t" +#endif "lsr r7, r7, #24\n\t" "ldrb r4, [r8, r4, lsl #2]\n\t" "ldrb r5, [r8, r5, lsl #2]\n\t" @@ -370,10 +563,37 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "stm %[ks], {r4, r5, r6, r7}\n\t" "sub %[ks], %[ks], #16\n\t" "mov r3, r7\n\t" 
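The ubfx / uxtb / lsl+lsr alternatives selected by the nested WOLFSSL_ARM_ARCH guards above all extract one byte of a 32-bit word: ubfx rd, rn, #lsb, #8 needs ARMv6T2/ARMv7, uxtb rd, rn, ror #lsb needs ARMv6, and the lsl/lsr pair is the pre-ARMv6 fallback. A minimal C sketch of the equivalence (illustrative only; the helper name is not from the sources):

    #include <stdint.h>

    /* What each of the three guarded forms computes:
     *   ubfx rd, w, #lsb, #8                      (ARMv7 / ARMv6T2)
     *   uxtb rd, w, ror #lsb                      (ARMv6, lsb in {0, 8, 16, 24})
     *   lsl rd, w, #(24 - lsb); lsr rd, rd, #24   (pre-ARMv6)
     */
    static inline uint32_t byte_at(uint32_t w, unsigned lsb)
    {
        return (w >> lsb) & 0xffu;
    }

The extracted byte is then used as the table index in the following ldr/ldrb instructions, so the three paths remain interchangeable without touching the rest of the round code.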
+#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r3, #16\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r3, ror #8\n\t" +#endif +#else "ubfx r4, r3, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r3, #8\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r3, ror #16\n\t" +#endif +#else "ubfx r5, r3, #16, #8\n\t" +#endif "lsr r6, r3, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r3, r3, #24\n\t" + "lsr r3, r3, #24\n\t" +#else + "uxtb r3, r3\n\t" +#endif +#else "ubfx r3, r3, #0, #8\n\t" +#endif "ldrb r4, [r8, r4, lsl #2]\n\t" "ldrb r6, [r8, r6, lsl #2]\n\t" "ldrb r5, [r8, r5, lsl #2]\n\t" @@ -391,9 +611,36 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "sub %[ks], %[ks], #16\n\t" "subs r12, r12, #1\n\t" "bne L_AES_set_encrypt_key_loop_256_%=\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r7, #24\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r7\n\t" +#endif +#else "ubfx r4, r7, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r7, #16\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r7, ror #8\n\t" +#endif +#else "ubfx r5, r7, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r7, #8\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r7, ror #16\n\t" +#endif +#else "ubfx r6, r7, #16, #8\n\t" +#endif "lsr r7, r7, #24\n\t" "ldrb r4, [r8, r4, lsl #2]\n\t" "ldrb r5, [r8, r5, lsl #2]\n\t" @@ -415,32 +662,65 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "b L_AES_set_encrypt_key_end_%=\n\t" "\n" "L_AES_set_encrypt_key_start_192_%=: \n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[key]]\n\t" "ldr r5, [%[key], #4]\n\t" #else "ldrd r4, r5, [%[key]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[key], #8]\n\t" "ldr r7, [%[key], #12]\n\t" #else "ldrd r6, r7, [%[key], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "ldr %[key], [%[key], #16]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr %[len], [%[key], #20]\n\t" + "ldr %[key], [%[key], #16]\n\t" #else "ldrd %[key], %[len], [%[key], #16]\n\t" #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + /* REV r4, r4 */ + "eor r3, r4, r4, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "eor r4, r4, r3, lsr #8\n\t" + /* REV r5, r5 */ + "eor r3, r5, r5, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r5, r5, #8\n\t" + "eor r5, r5, r3, lsr #8\n\t" + /* REV r6, r6 */ + "eor r3, r6, r6, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r6, r6, #8\n\t" + "eor r6, r6, r3, lsr #8\n\t" + /* REV r7, r7 */ + "eor r3, r7, r7, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r7, r7, #8\n\t" + "eor r7, r7, r3, lsr #8\n\t" + /* REV r0, r0 */ + "eor r3, %[key], %[key], ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror %[key], %[key], #8\n\t" + "eor %[key], %[key], r3, lsr #8\n\t" + /* REV r1, r1 */ + "eor r3, %[len], 
%[len], ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror %[len], %[len], #8\n\t" + "eor %[len], %[len], r3, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" "rev %[key], %[key]\n\t" "rev %[len], %[len]\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm %[ks], {r4, r5, r6, r7}\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str %[key], [%[ks], #16]\n\t" "str %[len], [%[ks], #20]\n\t" #else @@ -450,9 +730,36 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "mov r12, #7\n\t" "\n" "L_AES_set_encrypt_key_loop_192_%=: \n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r0, r7, #24\n\t" + "lsr r0, r0, #24\n\t" +#else + "uxtb r0, r7\n\t" +#endif +#else "ubfx r0, r7, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r1, r7, #16\n\t" + "lsr r1, r1, #24\n\t" +#else + "uxtb r1, r7, ror #8\n\t" +#endif +#else "ubfx r1, r7, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r7, #8\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r7, ror #16\n\t" +#endif +#else "ubfx r4, r7, #16, #8\n\t" +#endif "lsr r7, r7, #24\n\t" "ldrb r0, [r8, r0, lsl #2]\n\t" "ldrb r1, [r8, r1, lsl #2]\n\t" @@ -473,9 +780,36 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "stm %[ks], {r0, r1, r4, r5, r6, r7}\n\t" "subs r12, r12, #1\n\t" "bne L_AES_set_encrypt_key_loop_192_%=\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r0, r7, #24\n\t" + "lsr r0, r0, #24\n\t" +#else + "uxtb r0, r7\n\t" +#endif +#else "ubfx r0, r7, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r1, r7, #16\n\t" + "lsr r1, r1, #24\n\t" +#else + "uxtb r1, r7, ror #8\n\t" +#endif +#else "ubfx r1, r7, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r7, #8\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r7, ror #16\n\t" +#endif +#else "ubfx r4, r7, #16, #8\n\t" +#endif "lsr r7, r7, #24\n\t" "ldrb r0, [r8, r0, lsl #2]\n\t" "ldrb r1, [r8, r1, lsl #2]\n\t" @@ -495,29 +829,79 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "b L_AES_set_encrypt_key_end_%=\n\t" "\n" "L_AES_set_encrypt_key_start_128_%=: \n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[key]]\n\t" "ldr r5, [%[key], #4]\n\t" #else "ldrd r4, r5, [%[key]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[key], #8]\n\t" "ldr r7, [%[key], #12]\n\t" #else "ldrd r6, r7, [%[key], #8]\n\t" #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + /* REV r4, r4 */ + "eor r3, r4, r4, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "eor r4, r4, r3, lsr #8\n\t" + /* REV r5, r5 */ + "eor r3, r5, r5, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r5, r5, #8\n\t" + "eor r5, r5, r3, lsr #8\n\t" + /* REV r6, r6 */ + "eor r3, r6, r6, ror #16\n\t" + "bic r3, r3, 
#0xff0000\n\t" + "ror r6, r6, #8\n\t" + "eor r6, r6, r3, lsr #8\n\t" + /* REV r7, r7 */ + "eor r3, r7, r7, ror #16\n\t" + "bic r3, r3, #0xff0000\n\t" + "ror r7, r7, #8\n\t" + "eor r7, r7, r3, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm %[ks], {r4, r5, r6, r7}\n\t" "mov r12, #10\n\t" "\n" "L_AES_set_encrypt_key_loop_128_%=: \n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r7, #24\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r7\n\t" +#endif +#else "ubfx r4, r7, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r7, #16\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r7, ror #8\n\t" +#endif +#else "ubfx r5, r7, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r7, #8\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r7, ror #16\n\t" +#endif +#else "ubfx r6, r7, #16, #8\n\t" +#endif "lsr r7, r7, #24\n\t" "ldrb r4, [r8, r4, lsl #2]\n\t" "ldrb r5, [r8, r5, lsl #2]\n\t" @@ -555,43 +939,151 @@ void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t __asm__ __volatile__ ( "\n" "L_AES_encrypt_block_nr_%=: \n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r5, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r5, ror #16\n\t" +#endif +#else "ubfx r8, r5, #16, #8\n\t" +#endif "lsr r11, r4, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r6, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r6, ror #8\n\t" +#endif +#else "ubfx lr, r6, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r7, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r7\n\t" +#endif +#else "ubfx r2, r7, #0, #8\n\t" +#endif "ldr r8, [%[te], r8, lsl #2]\n\t" "ldr r11, [%[te], r11, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r9, r6, #8\n\t" + "lsr r9, r9, #24\n\t" +#else + "uxtb r9, r6, ror #16\n\t" +#endif +#else "ubfx r9, r6, #16, #8\n\t" +#endif "eor r8, r8, r11, ror #24\n\t" "lsr r11, r5, #24\n\t" "eor r8, r8, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r7, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r7, ror #8\n\t" +#endif +#else "ubfx lr, r7, #8, #8\n\t" +#endif "eor r8, r8, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r4, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r4\n\t" +#endif +#else "ubfx r2, r4, #0, #8\n\t" +#endif "ldr r9, [%[te], r9, lsl #2]\n\t" "ldr r11, [%[te], r11, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r7, #8\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r7, ror #16\n\t" +#endif +#else "ubfx r10, r7, #16, #8\n\t" +#endif "eor r9, 
r9, r11, ror #24\n\t" "lsr r11, r6, #24\n\t" "eor r9, r9, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r4, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r4, ror #8\n\t" +#endif +#else "ubfx lr, r4, #8, #8\n\t" +#endif "eor r9, r9, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5\n\t" +#endif +#else "ubfx r2, r5, #0, #8\n\t" +#endif "ldr r10, [%[te], r10, lsl #2]\n\t" "ldr r11, [%[te], r11, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r6, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r6\n\t" +#endif +#else "ubfx r6, r6, #0, #8\n\t" +#endif "eor r10, r10, r11, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r11, r4, #8\n\t" + "lsr r11, r11, #24\n\t" +#else + "uxtb r11, r4, ror #16\n\t" +#endif +#else "ubfx r11, r4, #16, #8\n\t" +#endif "eor r10, r10, lr, ror #8\n\t" "lsr lr, r7, #24\n\t" "eor r10, r10, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5, ror #8\n\t" +#endif +#else "ubfx r2, r5, #8, #8\n\t" +#endif "ldr r6, [%[te], r6, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r11, [%[te], r11, lsl #2]\n\t" @@ -605,43 +1097,151 @@ void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t "eor r9, r9, r5\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r9, #8\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r9, ror #16\n\t" +#endif +#else "ubfx r4, r9, #16, #8\n\t" +#endif "lsr r7, r8, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r10, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r10, ror #8\n\t" +#endif +#else "ubfx lr, r10, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r11, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r11\n\t" +#endif +#else "ubfx r2, r11, #0, #8\n\t" +#endif "ldr r4, [%[te], r4, lsl #2]\n\t" "ldr r7, [%[te], r7, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r10, #8\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r10, ror #16\n\t" +#endif +#else "ubfx r5, r10, #16, #8\n\t" +#endif "eor r4, r4, r7, ror #24\n\t" "lsr r7, r9, #24\n\t" "eor r4, r4, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r11, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r11, ror #8\n\t" +#endif +#else "ubfx lr, r11, #8, #8\n\t" +#endif "eor r4, r4, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r8, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r8\n\t" 
+#endif +#else "ubfx r2, r8, #0, #8\n\t" +#endif "ldr r5, [%[te], r5, lsl #2]\n\t" "ldr r7, [%[te], r7, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r11, #8\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r11, ror #16\n\t" +#endif +#else "ubfx r6, r11, #16, #8\n\t" +#endif "eor r5, r5, r7, ror #24\n\t" "lsr r7, r10, #24\n\t" "eor r5, r5, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r8, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r8, ror #8\n\t" +#endif +#else "ubfx lr, r8, #8, #8\n\t" +#endif "eor r5, r5, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r9, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r9\n\t" +#endif +#else "ubfx r2, r9, #0, #8\n\t" +#endif "ldr r6, [%[te], r6, lsl #2]\n\t" "ldr r7, [%[te], r7, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r10, #24\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r10\n\t" +#endif +#else "ubfx r10, r10, #0, #8\n\t" +#endif "eor r6, r6, r7, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r8, #8\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r8, ror #16\n\t" +#endif +#else "ubfx r7, r8, #16, #8\n\t" +#endif "eor r6, r6, lr, ror #8\n\t" "lsr lr, r11, #24\n\t" "eor r6, r6, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r9, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r9, ror #8\n\t" +#endif +#else "ubfx r2, r9, #8, #8\n\t" +#endif "ldr r10, [%[te], r10, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r7, [%[te], r7, lsl #2]\n\t" @@ -657,43 +1257,151 @@ void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t "eor r7, r7, r11\n\t" "subs %[nr], %[nr], #1\n\t" "bne L_AES_encrypt_block_nr_%=\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r5, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r5, ror #16\n\t" +#endif +#else "ubfx r8, r5, #16, #8\n\t" +#endif "lsr r11, r4, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r6, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r6, ror #8\n\t" +#endif +#else "ubfx lr, r6, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r7, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r7\n\t" +#endif +#else "ubfx r2, r7, #0, #8\n\t" +#endif "ldr r8, [%[te], r8, lsl #2]\n\t" "ldr r11, [%[te], r11, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r9, r6, #8\n\t" + "lsr r9, r9, #24\n\t" +#else + "uxtb r9, r6, ror #16\n\t" +#endif +#else "ubfx r9, r6, #16, #8\n\t" +#endif "eor r8, r8, r11, ror #24\n\t" "lsr r11, r5, #24\n\t" "eor r8, r8, lr, ror #8\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r7, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r7, ror #8\n\t" +#endif +#else "ubfx lr, r7, #8, #8\n\t" +#endif "eor r8, r8, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r4, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r4\n\t" +#endif +#else "ubfx r2, r4, #0, #8\n\t" +#endif "ldr r9, [%[te], r9, lsl #2]\n\t" "ldr r11, [%[te], r11, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r7, #8\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r7, ror #16\n\t" +#endif +#else "ubfx r10, r7, #16, #8\n\t" +#endif "eor r9, r9, r11, ror #24\n\t" "lsr r11, r6, #24\n\t" "eor r9, r9, lr, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r4, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r4, ror #8\n\t" +#endif +#else "ubfx lr, r4, #8, #8\n\t" +#endif "eor r9, r9, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #24\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5\n\t" +#endif +#else "ubfx r2, r5, #0, #8\n\t" +#endif "ldr r10, [%[te], r10, lsl #2]\n\t" "ldr r11, [%[te], r11, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r6, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r6\n\t" +#endif +#else "ubfx r6, r6, #0, #8\n\t" +#endif "eor r10, r10, r11, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r11, r4, #8\n\t" + "lsr r11, r11, #24\n\t" +#else + "uxtb r11, r4, ror #16\n\t" +#endif +#else "ubfx r11, r4, #16, #8\n\t" +#endif "eor r10, r10, lr, ror #8\n\t" "lsr lr, r7, #24\n\t" "eor r10, r10, r2, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r5, #16\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r5, ror #8\n\t" +#endif +#else "ubfx r2, r5, #8, #8\n\t" +#endif "ldr r6, [%[te], r6, lsl #2]\n\t" "ldr lr, [%[te], lr, lsl #2]\n\t" "ldr r11, [%[te], r11, lsl #2]\n\t" @@ -707,30 +1415,111 @@ void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t "eor r9, r9, r5\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r11, #24\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r11\n\t" +#endif +#else "ubfx r4, r11, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r10, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r10, ror #8\n\t" +#endif +#else "ubfx r7, r10, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r9, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r9, ror #16\n\t" +#endif +#else "ubfx lr, r9, #16, #8\n\t" +#endif "lsr r2, r8, #24\n\t" "ldrb r4, [%[te], r4, lsl 
#2]\n\t" "ldrb r7, [%[te], r7, lsl #2]\n\t" "ldrb lr, [%[te], lr, lsl #2]\n\t" "ldrb r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r8, #24\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r8\n\t" +#endif +#else "ubfx r5, r8, #0, #8\n\t" +#endif "eor r4, r4, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r11, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r11, ror #8\n\t" +#endif +#else "ubfx r7, r11, #8, #8\n\t" +#endif "eor r4, r4, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r10, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r10, ror #16\n\t" +#endif +#else "ubfx lr, r10, #16, #8\n\t" +#endif "eor r4, r4, r2, lsl #24\n\t" "lsr r2, r9, #24\n\t" "ldrb r5, [%[te], r5, lsl #2]\n\t" "ldrb r7, [%[te], r7, lsl #2]\n\t" "ldrb lr, [%[te], lr, lsl #2]\n\t" "ldrb r2, [%[te], r2, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r9, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r9\n\t" +#endif +#else "ubfx r6, r9, #0, #8\n\t" +#endif "eor r5, r5, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r8, ror #8\n\t" +#endif +#else "ubfx r7, r8, #8, #8\n\t" +#endif "eor r5, r5, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r11, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r11, ror #16\n\t" +#endif +#else "ubfx lr, r11, #16, #8\n\t" +#endif "eor r5, r5, r2, lsl #24\n\t" "lsr r2, r10, #24\n\t" "ldrb r6, [%[te], r6, lsl #2]\n\t" @@ -739,11 +1528,38 @@ void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t "ldrb r2, [%[te], r2, lsl #2]\n\t" "lsr r11, r11, #24\n\t" "eor r6, r6, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r10, #24\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r10\n\t" +#endif +#else "ubfx r7, r10, #0, #8\n\t" +#endif "eor r6, r6, lr, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r9, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r9, ror #8\n\t" +#endif +#else "ubfx lr, r9, #8, #8\n\t" +#endif "eor r6, r6, r2, lsl #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r2, r8, #8\n\t" + "lsr r2, r2, #24\n\t" +#else + "uxtb r2, r8, ror #16\n\t" +#endif +#else "ubfx r2, r8, #16, #8\n\t" +#endif "ldrb r11, [%[te], r11, lsl #2]\n\t" "ldrb r7, [%[te], r7, lsl #2]\n\t" "ldrb lr, [%[te], lr, lsl #2]\n\t" @@ -763,10 +1579,8 @@ void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t ); } -#if defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) -static const uint32_t* L_AES_ARM32_te_ecb = L_AES_ARM32_te_data; -#endif /* HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ #if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || 
defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +static const uint32_t* L_AES_ARM32_te_ecb = L_AES_ARM32_te_data; void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr); void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p) { @@ -792,10 +1606,29 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r5, [lr, #4]\n\t" "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "push {r1, %[len], lr}\n\t" "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ @@ -807,10 +1640,29 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "str r4, [%[out]]\n\t" "str r5, [%[out], #4]\n\t" "str r6, [%[out], #8]\n\t" @@ -828,10 +1680,29 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r5, [lr, #4]\n\t" "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "push {r1, %[len], lr}\n\t" "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ @@ -843,10 +1714,29 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, 
#0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "str r4, [%[out]]\n\t" "str r5, [%[out], #4]\n\t" "str r6, [%[out], #8]\n\t" @@ -864,10 +1754,29 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r5, [lr, #4]\n\t" "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "push {r1, %[len], lr}\n\t" "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ @@ -879,10 +1788,29 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "str r4, [%[out]]\n\t" "str r5, [%[out], #4]\n\t" "str r6, [%[out], #8]\n\t" @@ -898,11 +1826,11 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l : : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11" ); - (void)nr; } #endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_CBC +static const uint32_t* L_AES_ARM32_te_cbc = L_AES_ARM32_te_data; void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* iv_p) { @@ -912,13 +1840,13 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p; register int nr asm ("r4") = (int)nr_p; register unsigned char* iv asm ("r5") = (unsigned char*)iv_p; - register uint32_t* L_AES_ARM32_te_ecb_c asm ("r6") = (uint32_t*)L_AES_ARM32_te_ecb; + register uint32_t* L_AES_ARM32_te_cbc_c asm ("r6") = (uint32_t*)L_AES_ARM32_te_cbc; __asm__ __volatile__ ( "mov r8, r4\n\t" "mov r9, r5\n\t" "mov lr, 
%[in]\n\t" - "mov r0, %[L_AES_ARM32_te_ecb]\n\t" + "mov r0, %[L_AES_ARM32_te_cbc]\n\t" "ldm r9, {r4, r5, r6, r7}\n\t" "push {%[ks], r9}\n\t" "cmp r8, #10\n\t" @@ -936,11 +1864,30 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" "push {r1, %[len], lr}\n\t" - "ldm %[ks]!, {r8, r9, r10, r11}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ "eor r4, r4, r8\n\t" "eor r5, r5, r9\n\t" @@ -950,10 +1897,29 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "str r4, [%[out]]\n\t" "str r5, [%[out], #4]\n\t" "str r6, [%[out], #8]\n\t" @@ -976,11 +1942,30 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" "push {r1, %[len], lr}\n\t" - "ldm %[ks]!, {r8, r9, r10, r11}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ "eor r4, r4, r8\n\t" "eor r5, r5, r9\n\t" @@ -990,10 +1975,29 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + 
"ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "str r4, [%[out]]\n\t" "str r5, [%[out], #4]\n\t" "str r6, [%[out], #8]\n\t" @@ -1016,11 +2020,30 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" "push {r1, %[len], lr}\n\t" - "ldm %[ks]!, {r8, r9, r10, r11}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ "eor r4, r4, r8\n\t" "eor r5, r5, r9\n\t" @@ -1030,10 +2053,29 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "str r4, [%[out]]\n\t" "str r5, [%[out], #4]\n\t" "str r6, [%[out], #8]\n\t" @@ -1046,16 +2088,15 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "L_AES_CBC_encrypt_end_%=: \n\t" "pop {%[ks], r9}\n\t" "stm r9, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), [L_AES_ARM32_te_ecb] "+r" (L_AES_ARM32_te_ecb_c) + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), [L_AES_ARM32_te_cbc] "+r" (L_AES_ARM32_te_cbc_c) : : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); - (void)nr; - (void)iv; } #endif /* HAVE_AES_CBC */ #ifdef WOLFSSL_AES_COUNTER +static const uint32_t* L_AES_ARM32_te_ctr = L_AES_ARM32_te_data; void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned long len_p, const unsigned char* ks_p, int nr_p, unsigned char* ctr_p) { @@ -1065,18 +2106,37 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l register const unsigned char* ks asm ("r3") = (const unsigned char*)ks_p; register int nr asm ("r4") = (int)nr_p; register 
unsigned char* ctr asm ("r5") = (unsigned char*)ctr_p; - register uint32_t* L_AES_ARM32_te_ecb_c asm ("r6") = (uint32_t*)L_AES_ARM32_te_ecb; + register uint32_t* L_AES_ARM32_te_ctr_c asm ("r6") = (uint32_t*)L_AES_ARM32_te_ctr; __asm__ __volatile__ ( "mov r12, r4\n\t" "mov r8, r5\n\t" "mov lr, %[in]\n\t" - "mov r0, %[L_AES_ARM32_te_ecb]\n\t" + "mov r0, %[L_AES_ARM32_te_ctr]\n\t" "ldm r8, {r4, r5, r6, r7}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r10, r4, r4, ror #16\n\t" + "eor r11, r5, r5, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "eor r4, r4, r10, lsr #8\n\t" + "eor r5, r5, r11, lsr #8\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm r8, {r4, r5, r6, r7}\n\t" "push {%[ks], r8}\n\t" "cmp r12, #10\n\t" @@ -1102,10 +2162,29 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "ldr r8, [lr]\n\t" "ldr r9, [lr, #4]\n\t" "ldr r10, [lr, #8]\n\t" @@ -1146,10 +2225,29 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "ldr r8, [lr]\n\t" "ldr r9, [lr, #4]\n\t" "ldr r10, [lr, #8]\n\t" @@ -1190,10 +2288,29 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor 
r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "ldr r8, [lr]\n\t" "ldr r9, [lr, #4]\n\t" "ldr r10, [lr, #8]\n\t" @@ -1215,17 +2332,34 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "\n" "L_AES_CTR_encrypt_end_%=: \n\t" "pop {%[ks], r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r10, r4, r4, ror #16\n\t" + "eor r11, r5, r5, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "eor r4, r4, r10, lsr #8\n\t" + "eor r5, r5, r11, lsr #8\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm r8, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_ARM32_te_ecb] "+r" (L_AES_ARM32_te_ecb_c) + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_ARM32_te_ctr] "+r" (L_AES_ARM32_te_ctr_c) : : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); - (void)nr; - (void)ctr; } #endif /* WOLFSSL_AES_COUNTER */ @@ -1241,43 +2375,151 @@ void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p) __asm__ __volatile__ ( "\n" "L_AES_decrypt_block_nr_%=: \n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r7, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r7, ror #16\n\t" +#endif +#else "ubfx r8, r7, #16, #8\n\t" +#endif "lsr r11, r4, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r6, #16\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r6, ror #8\n\t" +#endif +#else "ubfx r12, r6, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r5, #24\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r5\n\t" +#endif +#else "ubfx lr, r5, #0, #8\n\t" +#endif "ldr r8, [%[td], r8, lsl #2]\n\t" "ldr r11, [%[td], r11, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr lr, [%[td], lr, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r9, r4, #8\n\t" + "lsr r9, r9, #24\n\t" +#else + "uxtb r9, r4, ror #16\n\t" +#endif +#else "ubfx r9, r4, #16, #8\n\t" +#endif "eor r8, r8, r11, ror #24\n\t" "lsr r11, r5, #24\n\t" "eor r8, r8, r12, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r7, #16\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r7, ror #8\n\t" +#endif +#else "ubfx r12, r7, #8, #8\n\t" +#endif "eor r8, r8, lr, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r6, #24\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r6\n\t" +#endif +#else "ubfx lr, r6, #0, #8\n\t" +#endif "ldr r9, [%[td], r9, lsl 
#2]\n\t" "ldr r11, [%[td], r11, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr lr, [%[td], lr, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r5, #8\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r5, ror #16\n\t" +#endif +#else "ubfx r10, r5, #16, #8\n\t" +#endif "eor r9, r9, r11, ror #24\n\t" "lsr r11, r6, #24\n\t" "eor r9, r9, r12, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r4, #16\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r4, ror #8\n\t" +#endif +#else "ubfx r12, r4, #8, #8\n\t" +#endif "eor r9, r9, lr, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r7, #24\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r7\n\t" +#endif +#else "ubfx lr, r7, #0, #8\n\t" +#endif "ldr r10, [%[td], r10, lsl #2]\n\t" "ldr r11, [%[td], r11, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr lr, [%[td], lr, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r4, #24\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r4\n\t" +#endif +#else "ubfx r4, r4, #0, #8\n\t" +#endif "eor r10, r10, r11, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r11, r6, #8\n\t" + "lsr r11, r11, #24\n\t" +#else + "uxtb r11, r6, ror #16\n\t" +#endif +#else "ubfx r11, r6, #16, #8\n\t" +#endif "eor r10, r10, r12, ror #8\n\t" "lsr r12, r7, #24\n\t" "eor r10, r10, lr, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r5, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r5, ror #8\n\t" +#endif +#else "ubfx lr, r5, #8, #8\n\t" +#endif "ldr r4, [%[td], r4, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr r11, [%[td], r11, lsl #2]\n\t" @@ -1291,43 +2533,151 @@ void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p) "eor r9, r9, r5\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r11, #8\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r11, ror #16\n\t" +#endif +#else "ubfx r4, r11, #16, #8\n\t" +#endif "lsr r7, r8, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r10, #16\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r10, ror #8\n\t" +#endif +#else "ubfx r12, r10, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r9, #24\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r9\n\t" +#endif +#else "ubfx lr, r9, #0, #8\n\t" +#endif "ldr r4, [%[td], r4, lsl #2]\n\t" "ldr r7, [%[td], r7, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr lr, [%[td], lr, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r8, #8\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r8, ror #16\n\t" +#endif +#else "ubfx r5, r8, #16, #8\n\t" +#endif "eor r4, r4, r7, ror #24\n\t" "lsr r7, r9, #24\n\t" "eor r4, r4, r12, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r11, #16\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r11, ror #8\n\t" +#endif +#else "ubfx r12, r11, #8, #8\n\t" +#endif "eor r4, r4, lr, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r10, #24\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r10\n\t" +#endif +#else "ubfx lr, r10, #0, #8\n\t" +#endif "ldr r5, [%[td], r5, lsl #2]\n\t" "ldr r7, [%[td], r7, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr lr, [%[td], lr, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r9, #8\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r9, ror #16\n\t" +#endif +#else "ubfx r6, r9, #16, #8\n\t" +#endif "eor r5, r5, r7, ror #24\n\t" "lsr r7, r10, #24\n\t" "eor r5, r5, r12, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r8, #16\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r8, ror #8\n\t" +#endif +#else "ubfx r12, r8, #8, #8\n\t" +#endif "eor r5, r5, lr, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r11, #24\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r11\n\t" +#endif +#else "ubfx lr, r11, #0, #8\n\t" +#endif "ldr r6, [%[td], r6, lsl #2]\n\t" "ldr r7, [%[td], r7, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr lr, [%[td], lr, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r8, #24\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r8\n\t" +#endif +#else "ubfx r8, r8, #0, #8\n\t" +#endif "eor r6, r6, r7, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r10, #8\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r10, ror #16\n\t" +#endif +#else "ubfx r7, r10, #16, #8\n\t" +#endif "eor r6, r6, r12, ror #8\n\t" "lsr r12, r11, #24\n\t" "eor r6, r6, lr, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r9, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r9, ror #8\n\t" +#endif +#else "ubfx lr, r9, #8, #8\n\t" +#endif "ldr r8, [%[td], r8, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr r7, [%[td], r7, lsl #2]\n\t" @@ -1343,43 +2693,151 @@ void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p) "eor r7, r7, r11\n\t" "subs %[nr], %[nr], #1\n\t" "bne L_AES_decrypt_block_nr_%=\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r8, r7, #8\n\t" + "lsr r8, r8, #24\n\t" +#else + "uxtb r8, r7, ror #16\n\t" +#endif +#else "ubfx r8, r7, #16, #8\n\t" +#endif "lsr r11, r4, #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r6, #16\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r6, ror #8\n\t" +#endif +#else "ubfx r12, r6, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r5, #24\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r5\n\t" +#endif +#else "ubfx lr, r5, #0, #8\n\t" +#endif "ldr r8, [%[td], r8, lsl #2]\n\t" "ldr r11, [%[td], r11, lsl 
#2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr lr, [%[td], lr, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r9, r4, #8\n\t" + "lsr r9, r9, #24\n\t" +#else + "uxtb r9, r4, ror #16\n\t" +#endif +#else "ubfx r9, r4, #16, #8\n\t" +#endif "eor r8, r8, r11, ror #24\n\t" "lsr r11, r5, #24\n\t" "eor r8, r8, r12, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r7, #16\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r7, ror #8\n\t" +#endif +#else "ubfx r12, r7, #8, #8\n\t" +#endif "eor r8, r8, lr, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r6, #24\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r6\n\t" +#endif +#else "ubfx lr, r6, #0, #8\n\t" +#endif "ldr r9, [%[td], r9, lsl #2]\n\t" "ldr r11, [%[td], r11, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr lr, [%[td], lr, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r10, r5, #8\n\t" + "lsr r10, r10, #24\n\t" +#else + "uxtb r10, r5, ror #16\n\t" +#endif +#else "ubfx r10, r5, #16, #8\n\t" +#endif "eor r9, r9, r11, ror #24\n\t" "lsr r11, r6, #24\n\t" "eor r9, r9, r12, ror #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r4, #16\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r4, ror #8\n\t" +#endif +#else "ubfx r12, r4, #8, #8\n\t" +#endif "eor r9, r9, lr, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r7, #24\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r7\n\t" +#endif +#else "ubfx lr, r7, #0, #8\n\t" +#endif "ldr r10, [%[td], r10, lsl #2]\n\t" "ldr r11, [%[td], r11, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr lr, [%[td], lr, lsl #2]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r4, #24\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r4\n\t" +#endif +#else "ubfx r4, r4, #0, #8\n\t" +#endif "eor r10, r10, r11, ror #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r11, r6, #8\n\t" + "lsr r11, r11, #24\n\t" +#else + "uxtb r11, r6, ror #16\n\t" +#endif +#else "ubfx r11, r6, #16, #8\n\t" +#endif "eor r10, r10, r12, ror #8\n\t" "lsr r12, r7, #24\n\t" "eor r10, r10, lr, ror #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r5, #16\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r5, ror #8\n\t" +#endif +#else "ubfx lr, r5, #8, #8\n\t" +#endif "ldr r4, [%[td], r4, lsl #2]\n\t" "ldr r12, [%[td], r12, lsl #2]\n\t" "ldr r11, [%[td], r11, lsl #2]\n\t" @@ -1393,30 +2851,111 @@ void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p) "eor r9, r9, r5\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r4, r9, #24\n\t" + "lsr r4, r4, #24\n\t" +#else + "uxtb r4, r9\n\t" +#endif +#else "ubfx r4, r9, #0, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + 
"lsl r7, r10, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r10, ror #8\n\t" +#endif +#else "ubfx r7, r10, #8, #8\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r11, #8\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r11, ror #16\n\t" +#endif +#else "ubfx r12, r11, #16, #8\n\t" +#endif "lsr lr, r8, #24\n\t" "ldrb r4, [%[td4], r4]\n\t" "ldrb r7, [%[td4], r7]\n\t" "ldrb r12, [%[td4], r12]\n\t" "ldrb lr, [%[td4], lr]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r5, r10, #24\n\t" + "lsr r5, r5, #24\n\t" +#else + "uxtb r5, r10\n\t" +#endif +#else "ubfx r5, r10, #0, #8\n\t" +#endif "eor r4, r4, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r11, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r11, ror #8\n\t" +#endif +#else "ubfx r7, r11, #8, #8\n\t" +#endif "eor r4, r4, r12, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r8, #8\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r8, ror #16\n\t" +#endif +#else "ubfx r12, r8, #16, #8\n\t" +#endif "eor r4, r4, lr, lsl #24\n\t" "lsr lr, r9, #24\n\t" "ldrb r7, [%[td4], r7]\n\t" "ldrb lr, [%[td4], lr]\n\t" "ldrb r5, [%[td4], r5]\n\t" "ldrb r12, [%[td4], r12]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r6, r11, #24\n\t" + "lsr r6, r6, #24\n\t" +#else + "uxtb r6, r11\n\t" +#endif +#else "ubfx r6, r11, #0, #8\n\t" +#endif "eor r5, r5, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r8, ror #8\n\t" +#endif +#else "ubfx r7, r8, #8, #8\n\t" +#endif "eor r5, r5, r12, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r9, #8\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r9, ror #16\n\t" +#endif +#else "ubfx r12, r9, #16, #8\n\t" +#endif "eor r5, r5, lr, lsl #24\n\t" "lsr lr, r10, #24\n\t" "ldrb r7, [%[td4], r7]\n\t" @@ -1425,11 +2964,38 @@ void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p) "ldrb r12, [%[td4], r12]\n\t" "lsr r11, r11, #24\n\t" "eor r6, r6, r7, lsl #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r7, r8, #24\n\t" + "lsr r7, r7, #24\n\t" +#else + "uxtb r7, r8\n\t" +#endif +#else "ubfx r7, r8, #0, #8\n\t" +#endif "eor r6, r6, r12, lsl #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl r12, r9, #16\n\t" + "lsr r12, r12, #24\n\t" +#else + "uxtb r12, r9, ror #8\n\t" +#endif +#else "ubfx r12, r9, #8, #8\n\t" +#endif "eor r6, r6, lr, lsl #24\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl lr, r10, #8\n\t" + "lsr lr, lr, #24\n\t" +#else + "uxtb lr, r10, ror #16\n\t" +#endif +#else "ubfx lr, r10, #16, #8\n\t" +#endif "ldrb r11, [%[td4], r11]\n\t" "ldrb r12, [%[td4], r12]\n\t" "ldrb r7, [%[td4], r7]\n\t" @@ -1513,10 +3079,29 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr 
r5, [lr, #4]\n\t" "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "push {r1, %[ks], r12, lr}\n\t" "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ @@ -1527,10 +3112,29 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "mov r1, #6\n\t" "bl AES_decrypt_block\n\t" "pop {r1, %[ks], r12, lr}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "str r4, [%[out]]\n\t" "str r5, [%[out], #4]\n\t" "str r6, [%[out], #8]\n\t" @@ -1548,10 +3152,29 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r5, [lr, #4]\n\t" "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "push {r1, %[ks], r12, lr}\n\t" "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ @@ -1562,10 +3185,29 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "mov r1, #5\n\t" "bl AES_decrypt_block\n\t" "pop {r1, %[ks], r12, lr}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "str r4, [%[out]]\n\t" "str 
r5, [%[out], #4]\n\t" "str r6, [%[out], #8]\n\t" @@ -1583,10 +3225,29 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r5, [lr, #4]\n\t" "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "push {r1, %[ks], r12, lr}\n\t" "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ @@ -1597,10 +3258,29 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "mov r1, #4\n\t" "bl AES_decrypt_block\n\t" "pop {r1, %[ks], r12, lr}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "str r4, [%[out]]\n\t" "str r5, [%[out], #4]\n\t" "str r6, [%[out], #8]\n\t" @@ -1615,7 +3295,6 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l : : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); - (void)nr; } #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ @@ -1652,23 +3331,42 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" "ldr lr, [sp, #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [lr, #16]\n\t" "str r5, [lr, #20]\n\t" #else "strd r4, r5, [lr, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [lr, #24]\n\t" "str r7, [lr, #28]\n\t" #else "strd r6, r7, [lr, #24]\n\t" #endif - "ldm %[ks]!, {r8, r9, r10, r11}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ "eor r4, r4, r8\n\t" "eor r5, r5, r9\n\t" @@ 
-1677,10 +3375,29 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "mov r1, #6\n\t" "bl AES_decrypt_block\n\t" "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "ldm lr, {r8, r9, r10, r11}\n\t" "pop {r1, r12, lr}\n\t" "ldr %[ks], [sp]\n\t" @@ -1702,23 +3419,42 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" "ldr lr, [sp, #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [lr]\n\t" "str r5, [lr, #4]\n\t" #else "strd r4, r5, [lr]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [lr, #8]\n\t" "str r7, [lr, #12]\n\t" #else "strd r6, r7, [lr, #8]\n\t" #endif - "ldm %[ks]!, {r8, r9, r10, r11}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ "eor r4, r4, r8\n\t" "eor r5, r5, r9\n\t" @@ -1727,17 +3463,36 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "mov r1, #6\n\t" "bl AES_decrypt_block\n\t" "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [lr, #16]\n\t" "ldr r9, [lr, #20]\n\t" #else "ldrd r8, r9, [lr, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [lr, #24]\n\t" "ldr r11, [lr, #28]\n\t" #else @@ -1766,23 +3521,42 @@ void 
AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" "ldr lr, [sp, #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [lr, #16]\n\t" "str r5, [lr, #20]\n\t" #else "strd r4, r5, [lr, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [lr, #24]\n\t" "str r7, [lr, #28]\n\t" #else "strd r6, r7, [lr, #24]\n\t" #endif - "ldm %[ks]!, {r8, r9, r10, r11}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ "eor r4, r4, r8\n\t" "eor r5, r5, r9\n\t" @@ -1791,10 +3565,29 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "mov r1, #5\n\t" "bl AES_decrypt_block\n\t" "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "ldm lr, {r8, r9, r10, r11}\n\t" "pop {r1, r12, lr}\n\t" "ldr %[ks], [sp]\n\t" @@ -1816,23 +3609,42 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" "ldr lr, [sp, #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [lr]\n\t" "str r5, [lr, #4]\n\t" #else "strd r4, r5, [lr]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [lr, #8]\n\t" "str r7, [lr, #12]\n\t" #else "strd r6, r7, [lr, #8]\n\t" #endif - "ldm %[ks]!, {r8, r9, r10, r11}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && 
WOLFSSL_ARM_ARCH < 6 */ + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ "eor r4, r4, r8\n\t" "eor r5, r5, r9\n\t" @@ -1841,17 +3653,36 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "mov r1, #5\n\t" "bl AES_decrypt_block\n\t" "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [lr, #16]\n\t" "ldr r9, [lr, #20]\n\t" #else "ldrd r8, r9, [lr, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [lr, #24]\n\t" "ldr r11, [lr, #28]\n\t" #else @@ -1880,23 +3711,42 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" "ldr lr, [sp, #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [lr, #16]\n\t" "str r5, [lr, #20]\n\t" #else "strd r4, r5, [lr, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [lr, #24]\n\t" "str r7, [lr, #28]\n\t" #else "strd r6, r7, [lr, #24]\n\t" #endif - "ldm %[ks]!, {r8, r9, r10, r11}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ "eor r4, r4, r8\n\t" "eor r5, r5, r9\n\t" @@ -1905,10 +3755,29 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "mov r1, #4\n\t" "bl AES_decrypt_block\n\t" "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" 
+#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "ldm lr, {r8, r9, r10, r11}\n\t" "pop {r1, r12, lr}\n\t" "ldr %[ks], [sp]\n\t" @@ -1930,23 +3799,42 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "ldr r6, [lr, #8]\n\t" "ldr r7, [lr, #12]\n\t" "ldr lr, [sp, #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [lr]\n\t" "str r5, [lr, #4]\n\t" #else "strd r4, r5, [lr]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [lr, #8]\n\t" "str r7, [lr, #12]\n\t" #else "strd r6, r7, [lr, #8]\n\t" #endif - "ldm %[ks]!, {r8, r9, r10, r11}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "ldm %[ks]!, {r8, r9, r10, r11}\n\t" /* Round: 0 - XOR in key schedule */ "eor r4, r4, r8\n\t" "eor r5, r5, r9\n\t" @@ -1955,17 +3843,36 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "mov r1, #4\n\t" "bl AES_decrypt_block\n\t" "ldr lr, [sp, #16]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [lr, #16]\n\t" "ldr r9, [lr, #20]\n\t" #else "ldrd r8, r9, [lr, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [lr, #24]\n\t" "ldr r11, [lr, #28]\n\t" #else @@ -1989,25 +3896,25 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "\n" "L_AES_CBC_decrypt_end_odd_%=: \n\t" "ldr r4, [sp, #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [r4, #16]\n\t" "ldr r9, [r4, #20]\n\t" #else "ldrd r8, r9, [r4, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [r4, #24]\n\t" "ldr r11, [r4, #28]\n\t" #else "ldrd r10, r11, [r4, #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [r4]\n\t" "str r9, [r4, #4]\n\t" #else "strd r8, r9, [r4]\n\t" #endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [r4, #8]\n\t" "str r11, [r4, #12]\n\t" #else @@ -2020,8 +3927,6 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l : : "memory", "r12", "lr", "r8", "r9", "r10", "r11" ); - (void)nr; - (void)iv; } #endif /* HAVE_AES_CBC */ @@ -2584,10 +4489,33 @@ void GCM_gmult_len(unsigned char* x_p, const unsigned char** m_p, const unsigned "eor r9, r9, r5\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + /* REV r8, r8 */ + "eor %[len], r8, r8, ror #16\n\t" + "bic %[len], %[len], #0xff0000\n\t" + "ror r8, r8, #8\n\t" + "eor r8, r8, %[len], lsr #8\n\t" + /* REV r9, r9 */ + "eor %[len], r9, r9, ror #16\n\t" + "bic %[len], %[len], #0xff0000\n\t" + "ror r9, r9, #8\n\t" + "eor r9, r9, %[len], lsr #8\n\t" + /* REV r10, r10 */ + "eor %[len], r10, r10, ror #16\n\t" + "bic %[len], %[len], #0xff0000\n\t" + "ror r10, r10, #8\n\t" + "eor r10, r10, %[len], lsr #8\n\t" + /* REV r11, r11 */ + "eor %[len], r11, r11, ror #16\n\t" + "bic %[len], %[len], #0xff0000\n\t" + "ror r11, r11, #8\n\t" + "eor r11, r11, %[len], lsr #8\n\t" +#else "rev r8, r8\n\t" "rev r9, r9\n\t" "rev r10, r10\n\t" "rev r11, r11\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm %[x], {r8, r9, r10, r11}\n\t" "pop {r3}\n\t" "subs %[len], %[len], #16\n\t" @@ -2617,10 +4545,29 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "mov lr, %[in]\n\t" "mov r0, %[L_AES_ARM32_te_gcm]\n\t" "ldm r8, {r4, r5, r6, r7}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r10, r4, r4, ror #16\n\t" + "eor r11, r5, r5, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "eor r4, r4, r10, lsr #8\n\t" + "eor r5, r5, r11, lsr #8\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm r8, {r4, r5, r6, r7}\n\t" "push {%[ks], r8}\n\t" "cmp r12, #10\n\t" @@ -2643,10 +4590,29 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "ldr r8, [lr]\n\t" "ldr r9, [lr, #4]\n\t" "ldr r10, [lr, #8]\n\t" @@ -2684,10 +4650,29 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + 
"eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "ldr r8, [lr]\n\t" "ldr r9, [lr, #4]\n\t" "ldr r10, [lr, #8]\n\t" @@ -2725,10 +4710,29 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "bl AES_encrypt_block\n\t" "pop {r1, %[len], lr}\n\t" "ldr %[ks], [sp]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "ldr r8, [lr]\n\t" "ldr r9, [lr, #4]\n\t" "ldr r10, [lr, #8]\n\t" @@ -2750,21 +4754,41 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "\n" "L_AES_GCM_encrypt_end_%=: \n\t" "pop {%[ks], r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "eor r10, r4, r4, ror #16\n\t" + "eor r11, r5, r5, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "eor r4, r4, r10, lsr #8\n\t" + "eor r5, r5, r11, lsr #8\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#else "rev r4, r4\n\t" "rev r5, r5\n\t" "rev r6, r6\n\t" "rev r7, r7\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm r8, {r4, r5, r6, r7}\n\t" : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_ARM32_te_gcm] "+r" (L_AES_ARM32_te_gcm_c) : : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); - (void)nr; - (void)ctr; } #endif /* HAVE_AESGCM */ #endif /* !NO_AES */ #endif /* !__aarch64__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* WOLFSSL_ARMASM */ + #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519.S b/wolfcrypt/src/port/arm/armv8-32-curve25519.S index 45be9a90e..52cdcf41a 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519.S @@ -49,13 +49,13 @@ fe_init: fe_add_sub_op: push {lr} # Add-Sub -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r2] ldr r5, [r2, #4] #else ldrd r4, r5, [r2] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3] ldr r7, [r3, #4] #else @@ -66,7 +66,7 @@ fe_add_sub_op: 
mov r12, #0 adcs r9, r5, r7 adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0] str r9, [r0, #4] #else @@ -75,19 +75,19 @@ fe_add_sub_op: # Sub subs r10, r4, r6 sbcs r11, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r10, [r1] str r11, [r1, #4] #else strd r10, r11, [r1] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r2, #8] ldr r5, [r2, #12] #else ldrd r4, r5, [r2, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #8] ldr r7, [r3, #12] #else @@ -98,7 +98,7 @@ fe_add_sub_op: mov lr, #0 sbcs r11, r5, r7 adc lr, lr, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r10, [r1, #8] str r11, [r1, #12] #else @@ -108,19 +108,19 @@ fe_add_sub_op: subs r12, r12, #1 adcs r8, r4, r6 adcs r9, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #8] str r9, [r0, #12] #else strd r8, r9, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r2, #16] ldr r5, [r2, #20] #else ldrd r4, r5, [r2, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #16] ldr r7, [r3, #20] #else @@ -131,7 +131,7 @@ fe_add_sub_op: mov r12, #0 adcs r9, r5, r7 adc r12, r12, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #16] str r9, [r0, #20] #else @@ -141,19 +141,19 @@ fe_add_sub_op: subs lr, lr, #1 sbcs r10, r4, r6 sbcs r11, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r10, [r1, #16] str r11, [r1, #20] #else strd r10, r11, [r1, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r2, #24] ldr r5, [r2, #28] #else ldrd r4, r5, [r2, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #24] ldr r7, [r3, #28] #else @@ -175,13 +175,13 @@ fe_add_sub_op: orr r3, r3, r9, lsr #31 mul r12, r3, r12 # Add -x*modulus (if overflow) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else @@ -191,19 +191,19 @@ fe_add_sub_op: adcs r5, r5, #0 adcs r6, r6, #0 adcs r7, r7, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else strd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #8] str r7, [r0, #12] #else strd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -211,16 +211,20 @@ fe_add_sub_op: #endif adcs r4, r4, #0 adcs r5, r5, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else strd r4, r5, [r0, #16] #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0x80000000 +#else bfc r9, #31, #1 +#endif adcs r8, r8, #0 adc r9, r9, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #24] str r9, [r0, #28] #else @@ -239,7 +243,11 @@ fe_add_sub_op: sbcs r7, r7, #0 sbcs r8, r8, #0 sbcs r9, r9, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r11, r11, #0x80000000 +#else bfc r11, #31, #1 +#endif sbcs r10, r10, #0 sbc r11, r11, #0 stm r1, {r4, r5, r6, r7, r8, r9, r10, r11} @@ -275,7 +283,11 @@ fe_sub_op: sbcs r9, r9, #0 sbcs r10, r10, #0 sbcs r11, r11, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic lr, lr, #0x80000000 +#else bfc lr, #31, #1 +#endif sbcs r12, r12, #0 sbc lr, lr, #0 stm r0, {r6, r7, r8, r9, r10, r11, r12, lr} @@ -321,7 +333,11 @@ fe_add_op: adcs r9, r9, #0 adcs r10, r10, #0 adcs r11, r11, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic lr, lr, #0x80000000 +#else bfc lr, #31, #1 +#endif adcs r12, r12, #0 adc lr, lr, #0 stm r0, {r6, r7, r8, r9, r10, r11, r12, lr} @@ -352,7 +368,11 @@ fe_frombytes: ldr r7, [r1, #20] ldr r8, [r1, #24] ldr r9, [r1, #28] +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0x80000000 +#else bfc r9, #31, #1 +#endif str r2, [r0] str r3, [r0, #4] str r4, [r0, #8] @@ -388,7 +408,11 @@ fe_tobytes: adcs r7, r7, #0 adcs r8, r8, #0 adc r9, r9, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0x80000000 +#else bfc r9, #31, #1 +#endif str r2, [r0] str r3, [r0, #4] str r4, [r0, #8] @@ -442,49 +466,49 @@ fe_0: fe_copy: push {r4, r5, lr} # Copy -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r2, [r1] ldr r3, [r1, #4] #else ldrd r2, r3, [r1] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r1, #8] ldr r5, [r1, #12] #else ldrd r4, r5, [r1, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r2, [r0] str r3, [r0, #4] #else strd r2, r3, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else strd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r2, [r1, #16] ldr r3, [r1, #20] #else ldrd r2, r3, [r1, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r1, #24] ldr r5, [r1, #28] #else ldrd r4, r5, [r1, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r2, [r0, #16] str r3, [r0, #20] #else strd r2, r3, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else @@ -540,7 +564,11 @@ 
fe_isnonzero: adcs r7, r7, #0 adcs r8, r8, #0 adc r9, r9, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0x80000000 +#else bfc r9, #31, #1 +#endif orr r2, r2, r3 orr r4, r4, r5 orr r6, r6, r7 @@ -580,8 +608,18 @@ fe_isnegative: .type fe_cmov_table, %function fe_cmov_table: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r2, #24 + asr r2, r2, #24 +#else sxtb r2, r2 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r3, r2, #24 + asr r3, r2, #31 +#else sbfx r3, r2, #7, #1 +#endif eor r12, r2, r3 sub r12, r12, r3 mov r4, #1 @@ -590,7 +628,7 @@ fe_cmov_table: mov r7, #0 mov r8, #0 mov r9, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -600,7 +638,7 @@ fe_cmov_table: ror r3, r3, #31 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1] ldr r11, [r1, #4] #else @@ -612,7 +650,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #32] ldr r11, [r1, #36] #else @@ -624,7 +662,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #64] ldr r11, [r1, #68] #else @@ -637,7 +675,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -647,7 +685,7 @@ fe_cmov_table: ror r3, r3, #30 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1] ldr r11, [r1, #4] #else @@ -659,7 +697,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #32] ldr r11, [r1, #36] #else @@ -671,7 +709,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #64] ldr r11, [r1, #68] #else @@ -684,7 +722,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -694,7 +732,7 @@ fe_cmov_table: ror r3, r3, #29 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1] ldr r11, [r1, #4] #else @@ -706,7 +744,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #32] ldr r11, [r1, #36] #else @@ -718,7 +756,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) 
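bfc (bit-field clear, ARMv6T2 and later) and sxtb (ARMv6 and later) are likewise replaced on the older-architecture paths by shift-and-mask sequences. A minimal C model of the two identities (helper names are illustrative; the sign-extension relies on an arithmetic right shift, as the asr does):

    #include <stdint.h>

    /* bfc rX, #31, #1  ->  bic rX, rX, #0x80000000:
     * clear bit 31 when folding a field element modulo 2^255 - 19. */
    static uint32_t clear_bit31(uint32_t x)
    {
        return x & ~0x80000000u;
    }

    /* sxtb rX, rY  ->  lsl #24 ; asr #24: sign-extend the low byte. */
    static int32_t sign_extend_byte(uint32_t x)
    {
        return (int32_t)(x << 24) >> 24;
    }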
ldr r10, [r1, #64] ldr r11, [r1, #68] #else @@ -731,7 +769,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -741,7 +779,7 @@ fe_cmov_table: ror r3, r3, #28 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1] ldr r11, [r1, #4] #else @@ -753,7 +791,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #32] ldr r11, [r1, #36] #else @@ -765,7 +803,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #64] ldr r11, [r1, #68] #else @@ -778,7 +816,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -788,7 +826,7 @@ fe_cmov_table: ror r3, r3, #27 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1] ldr r11, [r1, #4] #else @@ -800,7 +838,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #32] ldr r11, [r1, #36] #else @@ -812,7 +850,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #64] ldr r11, [r1, #68] #else @@ -825,7 +863,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -835,7 +873,7 @@ fe_cmov_table: ror r3, r3, #26 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1] ldr r11, [r1, #4] #else @@ -847,7 +885,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #32] ldr r11, [r1, #36] #else @@ -859,7 +897,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #64] ldr r11, [r1, #68] #else @@ -872,7 +910,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -882,7 +920,7 @@ fe_cmov_table: ror r3, r3, #25 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1] ldr r11, [r1, #4] #else @@ -894,7 +932,7 @@ fe_cmov_table: and 
r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #32] ldr r11, [r1, #36] #else @@ -906,7 +944,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #64] ldr r11, [r1, #68] #else @@ -919,7 +957,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -929,7 +967,7 @@ fe_cmov_table: ror r3, r3, #24 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1] ldr r11, [r1, #4] #else @@ -941,7 +979,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #32] ldr r11, [r1, #36] #else @@ -953,7 +991,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #64] ldr r11, [r1, #68] #else @@ -986,25 +1024,30 @@ fe_cmov_table: eor r11, r11, r9 and r11, r11, r12 eor r9, r9, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else strd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #32] str r7, [r0, #36] #else strd r6, r7, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #64] str r9, [r0, #68] #else strd r8, r9, [r0, #64] #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r3, r2, #24 + asr r3, r2, #31 +#else sbfx r3, r2, #7, #1 +#endif eor r12, r2, r3 sub r12, r12, r3 mov r4, #0 @@ -1013,7 +1056,7 @@ fe_cmov_table: mov r7, #0 mov r8, #0 mov r9, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1023,7 +1066,7 @@ fe_cmov_table: ror r3, r3, #31 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #8] ldr r11, [r1, #12] #else @@ -1035,7 +1078,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #40] ldr r11, [r1, #44] #else @@ -1047,7 +1090,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #72] ldr r11, [r1, #76] #else @@ -1060,7 +1103,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1070,7 +1113,7 @@ 
fe_cmov_table: ror r3, r3, #30 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #8] ldr r11, [r1, #12] #else @@ -1082,7 +1125,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #40] ldr r11, [r1, #44] #else @@ -1094,7 +1137,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #72] ldr r11, [r1, #76] #else @@ -1107,7 +1150,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1117,7 +1160,7 @@ fe_cmov_table: ror r3, r3, #29 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #8] ldr r11, [r1, #12] #else @@ -1129,7 +1172,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #40] ldr r11, [r1, #44] #else @@ -1141,7 +1184,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #72] ldr r11, [r1, #76] #else @@ -1154,7 +1197,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1164,7 +1207,7 @@ fe_cmov_table: ror r3, r3, #28 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #8] ldr r11, [r1, #12] #else @@ -1176,7 +1219,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #40] ldr r11, [r1, #44] #else @@ -1188,7 +1231,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #72] ldr r11, [r1, #76] #else @@ -1201,7 +1244,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1211,7 +1254,7 @@ fe_cmov_table: ror r3, r3, #27 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #8] ldr r11, [r1, #12] #else @@ -1223,7 +1266,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #40] ldr r11, [r1, #44] #else @@ -1235,7 +1278,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, 
r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #72] ldr r11, [r1, #76] #else @@ -1248,7 +1291,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1258,7 +1301,7 @@ fe_cmov_table: ror r3, r3, #26 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #8] ldr r11, [r1, #12] #else @@ -1270,7 +1313,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #40] ldr r11, [r1, #44] #else @@ -1282,7 +1325,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #72] ldr r11, [r1, #76] #else @@ -1295,7 +1338,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1305,7 +1348,7 @@ fe_cmov_table: ror r3, r3, #25 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #8] ldr r11, [r1, #12] #else @@ -1317,7 +1360,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #40] ldr r11, [r1, #44] #else @@ -1329,7 +1372,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #72] ldr r11, [r1, #76] #else @@ -1342,7 +1385,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1352,7 +1395,7 @@ fe_cmov_table: ror r3, r3, #24 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #8] ldr r11, [r1, #12] #else @@ -1364,7 +1407,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #40] ldr r11, [r1, #44] #else @@ -1376,7 +1419,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #72] ldr r11, [r1, #76] #else @@ -1410,25 +1453,30 @@ fe_cmov_table: eor r11, r11, r9 and r11, r11, r12 eor r9, r9, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else strd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #40] str r7, [r0, #44] #else strd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #72] str r9, [r0, #76] #else strd r8, r9, [r0, #72] #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r3, r2, #24 + asr r3, r2, #31 +#else sbfx r3, r2, #7, #1 +#endif eor r12, r2, r3 sub r12, r12, r3 mov r4, #0 @@ -1437,7 +1485,7 @@ fe_cmov_table: mov r7, #0 mov r8, #0 mov r9, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1447,7 +1495,7 @@ fe_cmov_table: ror r3, r3, #31 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #16] ldr r11, [r1, #20] #else @@ -1459,7 +1507,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #48] ldr r11, [r1, #52] #else @@ -1471,7 +1519,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #80] ldr r11, [r1, #84] #else @@ -1484,7 +1532,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1494,7 +1542,7 @@ fe_cmov_table: ror r3, r3, #30 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #16] ldr r11, [r1, #20] #else @@ -1506,7 +1554,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #48] ldr r11, [r1, #52] #else @@ -1518,7 +1566,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #80] ldr r11, [r1, #84] #else @@ -1531,7 +1579,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1541,7 +1589,7 @@ fe_cmov_table: ror r3, r3, #29 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #16] ldr r11, [r1, #20] #else @@ -1553,7 +1601,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #48] ldr r11, [r1, #52] #else @@ -1565,7 +1613,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #80] ldr r11, [r1, #84] #else @@ -1578,7 +1626,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, 
r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1588,7 +1636,7 @@ fe_cmov_table: ror r3, r3, #28 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #16] ldr r11, [r1, #20] #else @@ -1600,7 +1648,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #48] ldr r11, [r1, #52] #else @@ -1612,7 +1660,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #80] ldr r11, [r1, #84] #else @@ -1625,7 +1673,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1635,7 +1683,7 @@ fe_cmov_table: ror r3, r3, #27 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #16] ldr r11, [r1, #20] #else @@ -1647,7 +1695,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #48] ldr r11, [r1, #52] #else @@ -1659,7 +1707,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #80] ldr r11, [r1, #84] #else @@ -1672,7 +1720,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1682,7 +1730,7 @@ fe_cmov_table: ror r3, r3, #26 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #16] ldr r11, [r1, #20] #else @@ -1694,7 +1742,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #48] ldr r11, [r1, #52] #else @@ -1706,7 +1754,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #80] ldr r11, [r1, #84] #else @@ -1719,7 +1767,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1729,7 +1777,7 @@ fe_cmov_table: ror r3, r3, #25 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #16] ldr r11, [r1, #20] #else @@ -1741,7 +1789,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #48] ldr r11, [r1, #52] #else @@ -1753,7 +1801,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #80] ldr r11, [r1, #84] #else @@ -1766,7 +1814,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1776,7 +1824,7 @@ fe_cmov_table: ror r3, r3, #24 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #16] ldr r11, [r1, #20] #else @@ -1788,7 +1836,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #48] ldr r11, [r1, #52] #else @@ -1800,7 +1848,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #80] ldr r11, [r1, #84] #else @@ -1834,25 +1882,30 @@ fe_cmov_table: eor r11, r11, r9 and r11, r11, r12 eor r9, r9, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else strd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #48] str r7, [r0, #52] #else strd r6, r7, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #80] str r9, [r0, #84] #else strd r8, r9, [r0, #80] #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r3, r2, #24 + asr r3, r2, #31 +#else sbfx r3, r2, #7, #1 +#endif eor r12, r2, r3 sub r12, r12, r3 mov r4, #0 @@ -1861,7 +1914,7 @@ fe_cmov_table: mov r7, #0 mov r8, #0 mov r9, #0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1871,7 +1924,7 @@ fe_cmov_table: ror r3, r3, #31 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #24] ldr r11, [r1, #28] #else @@ -1883,7 +1936,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #56] ldr r11, [r1, #60] #else @@ -1895,7 +1948,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #88] ldr r11, [r1, #92] #else @@ -1908,7 +1961,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1918,7 +1971,7 @@ fe_cmov_table: ror r3, r3, #30 
ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #24] ldr r11, [r1, #28] #else @@ -1930,7 +1983,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #56] ldr r11, [r1, #60] #else @@ -1942,7 +1995,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #88] ldr r11, [r1, #92] #else @@ -1955,7 +2008,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -1965,7 +2018,7 @@ fe_cmov_table: ror r3, r3, #29 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #24] ldr r11, [r1, #28] #else @@ -1977,7 +2030,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #56] ldr r11, [r1, #60] #else @@ -1989,7 +2042,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #88] ldr r11, [r1, #92] #else @@ -2002,7 +2055,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -2012,7 +2065,7 @@ fe_cmov_table: ror r3, r3, #28 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #24] ldr r11, [r1, #28] #else @@ -2024,7 +2077,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #56] ldr r11, [r1, #60] #else @@ -2036,7 +2089,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #88] ldr r11, [r1, #92] #else @@ -2049,7 +2102,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -2059,7 +2112,7 @@ fe_cmov_table: ror r3, r3, #27 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #24] ldr r11, [r1, #28] #else @@ -2071,7 +2124,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #56] ldr r11, [r1, #60] #else @@ -2083,7 +2136,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #88] ldr r11, [r1, #92] #else @@ -2096,7 +2149,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -2106,7 +2159,7 @@ fe_cmov_table: ror r3, r3, #26 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #24] ldr r11, [r1, #28] #else @@ -2118,7 +2171,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #56] ldr r11, [r1, #60] #else @@ -2130,7 +2183,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #88] ldr r11, [r1, #92] #else @@ -2143,7 +2196,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -2153,7 +2206,7 @@ fe_cmov_table: ror r3, r3, #25 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #24] ldr r11, [r1, #28] #else @@ -2165,7 +2218,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #56] ldr r11, [r1, #60] #else @@ -2177,7 +2230,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #88] ldr r11, [r1, #92] #else @@ -2190,7 +2243,7 @@ fe_cmov_table: eor r8, r8, r10 eor r9, r9, r11 add r1, r1, #0x60 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x800000 lsl r3, r3, #8 add r3, r3, #0x0 @@ -2200,7 +2253,7 @@ fe_cmov_table: ror r3, r3, #24 ror r3, r3, r12 asr r3, r3, #31 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #24] ldr r11, [r1, #28] #else @@ -2212,7 +2265,7 @@ fe_cmov_table: and r11, r11, r3 eor r4, r4, r10 eor r5, r5, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #56] ldr r11, [r1, #60] #else @@ -2224,7 +2277,7 @@ fe_cmov_table: and r11, r11, r3 eor r6, r6, r10 eor r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r1, #88] ldr r11, [r1, #92] #else @@ -2257,19 +2310,19 @@ fe_cmov_table: eor r11, r11, r9 and r11, r11, r12 eor r9, r9, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else strd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) 
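fe_cmov_table reads every table entry regardless of the secret index: for each candidate, a mask that is all ones only when the index matches is built with the ror/asr pair, and the entry is folded in with and/eor, so the load pattern is index-independent. A rough C model of that select (helper names are mine; the real code seeds the accumulator with the neutral element and conditionally negates at the end):

    #include <stdint.h>

    /* All-ones when a == b, all-zeros otherwise, without a branch. */
    static uint32_t ct_eq_mask(uint32_t a, uint32_t b)
    {
        uint32_t d = a ^ b;
        uint32_t ne = (d | (0u - d)) >> 31;  /* 1 if different, 0 if equal */
        return ne - 1u;                      /* 0xffffffff or 0x00000000   */
    }

    /* Constant-time selection of one word from eight candidates. */
    static uint32_t ct_lookup(const uint32_t table[8], uint32_t idx)
    {
        uint32_t r = 0;
        uint32_t k;
        for (k = 1; k <= 8; k++)             /* entries are indexed 1..8 */
            r ^= table[k - 1] & ct_eq_mask(k, idx);
        return r;
    }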
&& (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #56] str r7, [r0, #60] #else strd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #88] str r9, [r0, #92] #else @@ -2284,12 +2337,26 @@ fe_cmov_table: .type fe_cmov_table, %function fe_cmov_table: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + lsl r2, r2, #24 + asr r2, r2, #24 +#else sxtb r2, r2 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + lsl r3, r2, #24 + asr r3, r2, #31 +#else sbfx r3, r2, #7, #1 +#endif eor r2, r2, r3 sub r2, r2, r3 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + sub lr, r2, #1 +#else clz lr, r2 lsl lr, lr, #26 +#endif /* defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) */ asr lr, lr, #31 mvn lr, lr add r2, r2, lr @@ -2381,6 +2448,389 @@ fe_cmov_table: #endif /* WC_NO_CACHE_RESISTANT */ #endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */ #endif /* HAVE_ED25519 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + .text + .align 4 + .globl fe_mul_op + .type fe_mul_op, %function +fe_mul_op: + push {lr} + sub sp, sp, #40 + str r0, [sp, #36] + mov r0, #0 + ldr r12, [r1] + # A[0] * B[0] + ldr lr, [r2] + umull r3, r4, r12, lr + # A[0] * B[2] + ldr lr, [r2, #8] + umull r5, r6, r12, lr + # A[0] * B[4] + ldr lr, [r2, #16] + umull r7, r8, r12, lr + # A[0] * B[6] + ldr lr, [r2, #24] + umull r9, r10, r12, lr + str r3, [sp] + # A[0] * B[1] + ldr lr, [r2, #4] + mov r11, r0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[0] * B[3] + ldr lr, [r2, #12] + adcs r6, r6, #0 + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[0] * B[5] + ldr lr, [r2, #20] + adcs r8, r8, #0 + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[0] * B[7] + ldr lr, [r2, #28] + adcs r10, r10, #0 + adc r3, r0, #0 + umlal r10, r3, r12, lr + # A[1] * B[0] + ldr r12, [r1, #4] + ldr lr, [r2] + mov r11, #0 + umlal r4, r11, r12, lr + str r4, [sp, #4] + adds r5, r5, r11 + # A[1] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[1] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[1] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[1] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[1] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[1] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[1] * B[7] + ldr lr, [r2, #28] + adc r4, r0, #0 + umlal r3, r4, r12, lr + # A[2] * B[0] + ldr r12, [r1, #8] + ldr lr, [r2] + mov r11, #0 + umlal r5, r11, r12, lr + str r5, [sp, #8] + adds r6, r6, r11 + # A[2] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[2] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[2] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[2] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[2] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[2] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[2] * B[7] + ldr lr, [r2, #28] + adc r5, r0, #0 + umlal r4, r5, r12, lr + # A[3] * B[0] + ldr r12, [r1, #12] + 
ldr lr, [r2] + mov r11, #0 + umlal r6, r11, r12, lr + str r6, [sp, #12] + adds r7, r7, r11 + # A[3] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[3] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[3] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[3] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[3] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[3] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[3] * B[7] + ldr lr, [r2, #28] + adc r6, r0, #0 + umlal r5, r6, r12, lr + # A[4] * B[0] + ldr r12, [r1, #16] + ldr lr, [r2] + mov r11, #0 + umlal r7, r11, r12, lr + str r7, [sp, #16] + adds r8, r8, r11 + # A[4] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[4] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[4] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[4] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[4] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[4] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[4] * B[7] + ldr lr, [r2, #28] + adc r7, r0, #0 + umlal r6, r7, r12, lr + # A[5] * B[0] + ldr r12, [r1, #20] + ldr lr, [r2] + mov r11, #0 + umlal r8, r11, r12, lr + str r8, [sp, #20] + adds r9, r9, r11 + # A[5] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[5] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[5] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[5] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[5] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[5] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[5] * B[7] + ldr lr, [r2, #28] + adc r8, r0, #0 + umlal r7, r8, r12, lr + # A[6] * B[0] + ldr r12, [r1, #24] + ldr lr, [r2] + mov r11, #0 + umlal r9, r11, r12, lr + str r9, [sp, #24] + adds r10, r10, r11 + # A[6] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[6] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[6] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[6] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[6] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[6] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[6] * B[7] + ldr lr, [r2, #28] + adc r9, r0, #0 + umlal r8, r9, r12, lr + # A[7] * B[0] + ldr r12, [r1, #28] + ldr lr, [r2] + mov r11, #0 + umlal r10, r11, r12, lr + str r10, [sp, #28] + adds r3, r3, r11 + # A[7] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[7] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[7] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r5, r11, r12, lr + 
adds r6, r6, r11 + # A[7] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[7] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[7] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[7] * B[7] + ldr lr, [r2, #28] + adc r10, r0, #0 + umlal r9, r10, r12, lr + # Reduce + ldr r2, [sp, #28] + mov lr, sp + mov r12, #38 + umull r10, r11, r12, r10 + adds r10, r10, r2 + adc r11, r11, #0 + mov r12, #19 + lsl r11, r11, #1 + orr r11, r11, r10, LSR #31 + mul r11, r12, r11 + ldm lr!, {r1, r2} + mov r12, #38 + adds r1, r1, r11 + adc r11, r0, #0 + umlal r1, r11, r3, r12 + adds r2, r2, r11 + adc r11, r0, #0 + umlal r2, r11, r4, r12 + ldm lr!, {r3, r4} + adds r3, r3, r11 + adc r11, r0, #0 + umlal r3, r11, r5, r12 + adds r4, r4, r11 + adc r11, r0, #0 + umlal r4, r11, r6, r12 + ldm lr!, {r5, r6} + adds r5, r5, r11 + adc r11, r0, #0 + umlal r5, r11, r7, r12 + adds r6, r6, r11 + adc r11, r0, #0 + umlal r6, r11, r8, r12 + ldm lr!, {r7, r8} + adds r7, r7, r11 + adc r11, r0, #0 + umlal r7, r11, r9, r12 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r10, r10, #0x80000000 +#else + bfc r10, #31, #1 +#endif + adds r8, r10, r11 + # Store + ldr r0, [sp, #36] + stm r0, {r1, r2, r3, r4, r5, r6, r7, r8} + add sp, sp, #40 + pop {pc} + .size fe_mul_op,.-fe_mul_op +#else .text .align 4 .globl fe_mul_op @@ -2388,7 +2838,7 @@ fe_cmov_table: fe_mul_op: push {lr} sub sp, sp, #44 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r0, [sp, #36] str r1, [sp, #40] #else @@ -2508,7 +2958,11 @@ fe_mul_op: umaal r4, r11, r7, lr umaal r5, r11, r8, lr pop {r6} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r10, r10, #0x80000000 +#else bfc r10, #31, #1 +#endif umaal r6, r11, r9, lr add r7, r10, r11 ldr lr, [sp, #8] @@ -2517,6 +2971,7 @@ fe_mul_op: add sp, sp, #16 pop {pc} .size fe_mul_op,.-fe_mul_op +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ .text .align 4 .globl fe_mul @@ -2526,6 +2981,282 @@ fe_mul: bl fe_mul_op pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_mul,.-fe_mul +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + .text + .align 4 + .globl fe_sq_op + .type fe_sq_op, %function +fe_sq_op: + push {lr} + sub sp, sp, #0x44 + str r0, [sp, #64] + # Square + mov r0, #0 + ldr r12, [r1] + # A[0] * A[1] + ldr lr, [r1, #4] + umull r4, r5, r12, lr + # A[0] * A[3] + ldr lr, [r1, #12] + umull r6, r7, r12, lr + # A[0] * A[5] + ldr lr, [r1, #20] + umull r8, r9, r12, lr + # A[0] * A[7] + ldr lr, [r1, #28] + umull r10, r3, r12, lr + # A[0] * A[2] + ldr lr, [r1, #8] + mov r11, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[0] * A[4] + ldr lr, [r1, #16] + adcs r7, r7, #0 + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[0] * A[6] + ldr lr, [r1, #24] + adcs r9, r9, #0 + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + adcs r3, r3, #0 + str r4, [sp, #4] + str r5, [sp, #8] + # A[1] * A[2] + ldr r12, [r1, #4] + ldr lr, [r1, #8] + mov r11, #0 + umlal r6, r11, r12, lr + str r6, [sp, #12] + adds r7, r7, r11 + # A[1] * A[3] + ldr lr, [r1, #12] + adc r11, r0, #0 + umlal r7, r11, r12, lr + str r7, [sp, #16] + adds r8, r8, r11 + # A[1] * A[4] + ldr lr, [r1, #16] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[1] * A[5] + ldr lr, [r1, #20] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[1] * A[6] + ldr lr, [r1, #24] + adc 
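The "# Reduce" block folds the 512-bit product back into eight words using 2^256 ≡ 38 and 2^255 ≡ 19 (mod 2^255 − 19): the high half is scaled by 38 and added to the low half, then anything at bit 255 and above is scaled by 19 and added back, leaving a partially reduced result. A plain C model of the fold (not the wolfSSL code; t[] is the 16-word little-endian product):

    #include <stdint.h>

    static void fe_reduce_512(uint32_t r[8], const uint32_t t[16])
    {
        uint64_t c = 0;
        uint32_t h[8];
        int i;

        /* fold the high half in: h = lo + 38*hi (2^256 = 38 mod p) */
        for (i = 0; i < 8; i++) {
            c += (uint64_t)t[i] + (uint64_t)t[i + 8] * 38u;
            h[i] = (uint32_t)c;
            c >>= 32;
        }
        /* fold bit 255 and the leftover carry in as *19 (2^255 = 19) */
        c = (c << 1) | (h[7] >> 31);
        h[7] &= 0x7fffffffu;
        c *= 19u;
        for (i = 0; i < 8; i++) {
            c += h[i];
            r[i] = (uint32_t)c;
            c >>= 32;
        }
    }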
r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[1] * A[7] + ldr lr, [r1, #28] + adc r4, r0, #0 + umlal r3, r4, r12, lr + # A[2] * A[3] + ldr r12, [r1, #8] + ldr lr, [r1, #12] + mov r11, #0 + umlal r8, r11, r12, lr + str r8, [sp, #20] + adds r9, r9, r11 + # A[2] * A[4] + ldr lr, [r1, #16] + adc r11, r0, #0 + umlal r9, r11, r12, lr + str r9, [sp, #24] + adds r10, r10, r11 + # A[2] * A[5] + ldr lr, [r1, #20] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[2] * A[6] + ldr lr, [r1, #24] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[2] * A[7] + ldr lr, [r1, #28] + adc r5, r0, #0 + umlal r4, r5, r12, lr + # A[3] * A[4] + ldr r12, [r1, #12] + ldr lr, [r1, #16] + mov r11, #0 + umlal r10, r11, r12, lr + str r10, [sp, #28] + adds r3, r3, r11 + # A[3] * A[5] + ldr lr, [r1, #20] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[3] * A[6] + ldr lr, [r1, #24] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[3] * A[7] + ldr lr, [r1, #28] + adc r6, r0, #0 + umlal r5, r6, r12, lr + # A[4] * A[5] + ldr r12, [r1, #16] + ldr lr, [r1, #20] + mov r11, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[4] * A[6] + ldr lr, [r1, #24] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[4] * A[7] + ldr lr, [r1, #28] + adc r7, r0, #0 + umlal r6, r7, r12, lr + # A[5] * A[6] + ldr r12, [r1, #20] + ldr lr, [r1, #24] + mov r11, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[5] * A[7] + ldr lr, [r1, #28] + adc r8, r0, #0 + umlal r7, r8, r12, lr + # A[6] * A[7] + ldr r12, [r1, #24] + ldr lr, [r1, #28] + mov r9, #0 + umlal r8, r9, r12, lr + add lr, sp, #32 + stm lr, {r3, r4, r5, r6, r7, r8, r9} + add lr, sp, #4 + ldm lr, {r4, r5, r6, r7, r8, r9, r10} + adds r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adcs r10, r10, r10 + stm lr!, {r4, r5, r6, r7, r8, r9, r10} + ldm lr, {r3, r4, r5, r6, r7, r8, r9} + adcs r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adc r10, r0, #0 + stm lr, {r3, r4, r5, r6, r7, r8, r9, r10} + add lr, sp, #4 + ldm lr, {r4, r5, r6, r7, r8, r9, r10} + mov lr, sp + # A[0] * A[0] + ldr r12, [r1] + umull r3, r11, r12, r12 + adds r4, r4, r11 + # A[1] * A[1] + ldr r12, [r1, #4] + adcs r5, r5, #0 + adc r11, r0, #0 + umlal r5, r11, r12, r12 + adds r6, r6, r11 + # A[2] * A[2] + ldr r12, [r1, #8] + adcs r7, r7, #0 + adc r11, r0, #0 + umlal r7, r11, r12, r12 + adds r8, r8, r11 + # A[3] * A[3] + ldr r12, [r1, #12] + adcs r9, r9, #0 + adc r11, r0, #0 + umlal r9, r11, r12, r12 + adds r10, r10, r11 + stm lr!, {r3, r4, r5, r6, r7, r8, r9, r10} + ldm lr, {r3, r4, r5, r6, r7, r8, r9, r10} + # A[4] * A[4] + ldr r12, [r1, #16] + adcs r3, r3, #0 + adc r11, r0, #0 + umlal r3, r11, r12, r12 + adds r4, r4, r11 + # A[5] * A[5] + ldr r12, [r1, #20] + adcs r5, r5, #0 + adc r11, r0, #0 + umlal r5, r11, r12, r12 + adds r6, r6, r11 + # A[6] * A[6] + ldr r12, [r1, #24] + adcs r7, r7, #0 + adc r11, r0, #0 + umlal r7, r11, r12, r12 + adds r8, r8, r11 + # A[7] * A[7] + ldr r12, [r1, #28] + adcs r9, r9, #0 + adc r10, r10, #0 + umlal r9, r10, r12, r12 + # Reduce + ldr r2, [sp, #28] + mov lr, sp + mov r12, #38 + umull r10, r11, r12, r10 + adds r10, r10, r2 + adc r11, r11, #0 + mov r12, #19 + lsl r11, r11, #1 + orr r11, r11, r10, LSR #31 + mul r11, r12, r11 + ldm lr!, {r1, r2} + mov r12, #38 + adds r1, r1, r11 + adc r11, r0, #0 + umlal r1, r11, r3, r12 + adds r2, r2, r11 + adc r11, r0, #0 + umlal 
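fe_sq_op follows the usual squaring shortcut: compute each off-diagonal product a[i]*a[j] (i < j) once, double that partial result, then add the diagonal squares a[i]^2. A C model of that shape (again illustrative, not the wolfSSL routine):

    #include <stdint.h>

    static void sqr_8(uint32_t t[16], const uint32_t a[8])
    {
        uint64_t c;
        int i, j;

        for (i = 0; i < 16; i++)
            t[i] = 0;
        /* off-diagonal products a[i]*a[j], i < j */
        for (i = 0; i < 8; i++) {
            c = 0;
            for (j = i + 1; j < 8; j++) {
                c += (uint64_t)t[i + j] + (uint64_t)a[i] * a[j];
                t[i + j] = (uint32_t)c;
                c >>= 32;
            }
            t[i + 8] = (uint32_t)c;
        }
        /* double the off-diagonal part */
        c = 0;
        for (i = 0; i < 16; i++) {
            c += 2u * (uint64_t)t[i];
            t[i] = (uint32_t)c;
            c >>= 32;
        }
        /* add the diagonal squares a[i]^2 at word position 2*i */
        c = 0;
        for (i = 0; i < 8; i++) {
            uint64_t s = (uint64_t)a[i] * a[i];
            c += (uint64_t)t[2 * i] + (uint32_t)s;
            t[2 * i] = (uint32_t)c;
            c >>= 32;
            c += (uint64_t)t[2 * i + 1] + (uint32_t)(s >> 32);
            t[2 * i + 1] = (uint32_t)c;
            c >>= 32;
        }
    }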
r2, r11, r4, r12 + ldm lr!, {r3, r4} + adds r3, r3, r11 + adc r11, r0, #0 + umlal r3, r11, r5, r12 + adds r4, r4, r11 + adc r11, r0, #0 + umlal r4, r11, r6, r12 + ldm lr!, {r5, r6} + adds r5, r5, r11 + adc r11, r0, #0 + umlal r5, r11, r7, r12 + adds r6, r6, r11 + adc r11, r0, #0 + umlal r6, r11, r8, r12 + ldm lr!, {r7, r8} + adds r7, r7, r11 + adc r11, r0, #0 + umlal r7, r11, r9, r12 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r10, r10, #0x80000000 +#else + bfc r10, #31, #1 +#endif + adds r8, r10, r11 + # Store + ldr r0, [sp, #64] + stm r0, {r1, r2, r3, r4, r5, r6, r7, r8} + add sp, sp, #0x44 + pop {pc} + .size fe_sq_op,.-fe_sq_op +#else .text .align 4 .globl fe_sq_op @@ -2550,7 +3281,7 @@ fe_sq_op: umaal r9, r12, r1, r2 adcs r9, r9, r9 umaal r9, r11, lr, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [sp, #8] str r9, [sp, #12] #else @@ -2640,7 +3371,11 @@ fe_sq_op: mov r12, r6 pop {r5, r6} umaal r5, lr, r8, r12 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r7, r7, #0x80000000 +#else bfc r7, #31, #1 +#endif umaal r6, lr, r9, r12 add r7, r7, lr pop {lr} @@ -2648,6 +3383,7 @@ fe_sq_op: stm lr, {r0, r1, r2, r3, r4, r5, r6, r7} pop {pc} .size fe_sq_op,.-fe_sq_op +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ .text .align 4 .globl fe_sq @@ -2658,6 +3394,7 @@ fe_sq: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_sq,.-fe_sq #ifdef HAVE_CURVE25519 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) .text .align 4 .globl fe_mul121666 @@ -2666,7 +3403,81 @@ fe_mul121666: push {r4, r5, r6, r7, r8, r9, r10, lr} # Multiply by 121666 ldm r1, {r2, r3, r4, r5, r6, r7, r8, r9} -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #1 + lsl r10, r10, #8 + orr r10, r10, #0xdb + lsl r10, r10, #8 + orr r10, r10, #0x42 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0xdb + lsl r10, r10, #8 + add r10, r10, #0x42 +#else + mov r10, #0xdb42 +#endif + movt r10, #1 +#endif + umull r2, r12, r10, r2 + umull r3, lr, r10, r3 + adds r3, r3, r12 + adc lr, lr, #0 + umull r4, r12, r10, r4 + adds r4, r4, lr + adc r12, r12, #0 + umull r5, lr, r10, r5 + adds r5, r5, r12 + adc lr, lr, #0 + umull r6, r12, r10, r6 + adds r6, r6, lr + adc r12, r12, #0 + umull r7, lr, r10, r7 + adds r7, r7, r12 + adc lr, lr, #0 + umull r8, r12, r10, r8 + adds r8, r8, lr + adc r12, r12, #0 + umull r9, lr, r10, r9 + adds r9, r9, r12 + mov r10, #19 + adc lr, lr, #0 + lsl lr, lr, #1 + orr lr, lr, r9, LSR #31 + mul lr, r10, lr + adds r2, r2, lr + adcs r3, r3, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0x80000000 +#else + bfc r9, #31, #1 +#endif + adcs r8, r8, #0 + adc r9, r9, #0 + stm r0, {r2, r3, r4, r5, r6, r7, r8, r9} + pop {r4, r5, r6, r7, r8, r9, r10, pc} + .size fe_mul121666,.-fe_mul121666 +#else + .text + .align 4 + .globl fe_mul121666 + .type fe_mul121666, %function +fe_mul121666: + push {r4, r5, r6, r7, r8, r9, r10, lr} + # Multiply by 121666 + ldm r1, {r2, r3, r4, r5, r6, r7, r8, r9} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov lr, #1 + lsl lr, lr, #8 + orr lr, lr, #0xdb + lsl lr, lr, #8 + orr lr, lr, #0x42 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov lr, #0xdb lsl lr, lr, #8 add lr, lr, #0x42 @@ -2674,31 +3485,37 @@ fe_mul121666: mov lr, #0xdb42 #endif movt lr, #1 
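fe_mul121666 scales a field element by the curve constant 121666 (0x1db42). Without movw/movt (pre-ARMv7) the constant is assembled with mov/lsl/orr, and the top-word overflow is folded back with the same *19 trick as in the multiply. A small C model of what the routine computes (illustrative only):

    #include <stdint.h>

    static void fe_mul121666_model(uint32_t r[8], const uint32_t a[8])
    {
        uint64_t c = 0;
        int i;

        /* per-limb scale by 121666 with carry propagation */
        for (i = 0; i < 8; i++) {
            c += (uint64_t)a[i] * 121666u;
            r[i] = (uint32_t)c;
            c >>= 32;
        }
        /* fold the overflow and bit 255 back in: 2^255 = 19 (mod p) */
        c = (c << 1) | (r[7] >> 31);
        r[7] &= 0x7fffffffu;
        c *= 19u;
        for (i = 0; i < 8; i++) {
            c += r[i];
            r[i] = (uint32_t)c;
            c >>= 32;
        }
    }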
- umull r2, r10, r2, lr +#endif + umull r2, r10, lr, r2 sub r12, lr, #1 - umaal r3, r10, r3, r12 - umaal r4, r10, r4, r12 - umaal r5, r10, r5, r12 - umaal r6, r10, r6, r12 - umaal r7, r10, r7, r12 - umaal r8, r10, r8, r12 + umaal r3, r10, r12, r3 + umaal r4, r10, r12, r4 + umaal r5, r10, r12, r5 + umaal r6, r10, r12, r6 + umaal r7, r10, r12, r7 + umaal r8, r10, r12, r8 mov lr, #19 - umaal r9, r10, r9, r12 + umaal r9, r10, r12, r9 lsl r10, r10, #1 orr r10, r10, r9, lsr #31 - mul r10, r10, lr + mul r10, lr, r10 adds r2, r2, r10 adcs r3, r3, #0 adcs r4, r4, #0 adcs r5, r5, #0 adcs r6, r6, #0 adcs r7, r7, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0x80000000 +#else bfc r9, #31, #1 +#endif adcs r8, r8, #0 adc r9, r9, #0 stm r0, {r2, r3, r4, r5, r6, r7, r8, r9} pop {r4, r5, r6, r7, r8, r9, r10, pc} .size fe_mul121666,.-fe_mul121666 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ #ifndef WC_NO_CACHE_RESISTANT .text .align 4 @@ -3359,7 +4176,11 @@ L_curve25519_inv_8: adcs r7, r7, #0 adcs r8, r8, #0 adcs r9, r9, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r11, r11, #0x80000000 +#else bfc r11, #31, #1 +#endif adcs r10, r10, #0 adc r11, r11, #0 stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} @@ -3528,6 +4349,323 @@ L_fe_invert8: add sp, sp, #0x88 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size fe_invert,.-fe_invert +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + .text + .align 4 + .globl fe_sq2 + .type fe_sq2, %function +fe_sq2: + push {lr} + sub sp, sp, #0x44 + str r0, [sp, #64] + # Square * 2 + mov r0, #0 + ldr r12, [r1] + # A[0] * A[1] + ldr lr, [r1, #4] + umull r4, r5, r12, lr + # A[0] * A[3] + ldr lr, [r1, #12] + umull r6, r7, r12, lr + # A[0] * A[5] + ldr lr, [r1, #20] + umull r8, r9, r12, lr + # A[0] * A[7] + ldr lr, [r1, #28] + umull r10, r3, r12, lr + # A[0] * A[2] + ldr lr, [r1, #8] + mov r11, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[0] * A[4] + ldr lr, [r1, #16] + adcs r7, r7, #0 + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[0] * A[6] + ldr lr, [r1, #24] + adcs r9, r9, #0 + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + adcs r3, r3, #0 + str r4, [sp, #4] + str r5, [sp, #8] + # A[1] * A[2] + ldr r12, [r1, #4] + ldr lr, [r1, #8] + mov r11, #0 + umlal r6, r11, r12, lr + str r6, [sp, #12] + adds r7, r7, r11 + # A[1] * A[3] + ldr lr, [r1, #12] + adc r11, r0, #0 + umlal r7, r11, r12, lr + str r7, [sp, #16] + adds r8, r8, r11 + # A[1] * A[4] + ldr lr, [r1, #16] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[1] * A[5] + ldr lr, [r1, #20] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[1] * A[6] + ldr lr, [r1, #24] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[1] * A[7] + ldr lr, [r1, #28] + adc r4, r0, #0 + umlal r3, r4, r12, lr + # A[2] * A[3] + ldr r12, [r1, #8] + ldr lr, [r1, #12] + mov r11, #0 + umlal r8, r11, r12, lr + str r8, [sp, #20] + adds r9, r9, r11 + # A[2] * A[4] + ldr lr, [r1, #16] + adc r11, r0, #0 + umlal r9, r11, r12, lr + str r9, [sp, #24] + adds r10, r10, r11 + # A[2] * A[5] + ldr lr, [r1, #20] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[2] * A[6] + ldr lr, [r1, #24] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[2] * A[7] + ldr lr, [r1, #28] + adc r5, r0, #0 + umlal r4, r5, r12, lr + # A[3] * A[4] + ldr r12, [r1, #12] + ldr lr, [r1, #16] + mov r11, #0 + umlal r10, r11, r12, lr + str r10, [sp, #28] + adds r3, r3, r11 + # A[3] * A[5] + ldr lr, [r1, #20] 
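Note on `fe_mul121666` above: it scales a field element by 121666 = 0x1DB42, the (A+2)/4 constant of the X25519 Montgomery ladder. On the pre-ARMv7 path the constant is assembled with `mov`/`lsl`/`orr` because `movt` (ARMv6T2+) is not available, and the overflow above bit 255 is folded back in multiplied by 19. A rough portable-C sketch of the same computation follows; the function name is invented for illustration and, like the assembly, it leaves the result only weakly reduced (congruent mod p, not necessarily less than p):

    #include <stdint.h>

    /* Field elements are 8 little-endian 32-bit limbs modulo p = 2^255 - 19. */
    static void fe_mul121666_ref(uint32_t r[8], const uint32_t a[8])
    {
        uint64_t c = 0;
        uint32_t t[8];
        int i;

        for (i = 0; i < 8; i++) {          /* t = a * 121666 (low 256 bits) */
            c += (uint64_t)a[i] * 121666u;
            t[i] = (uint32_t)c;
            c >>= 32;
        }
        /* Bits 255 and up are each worth 19 modulo p, so take the carry
         * together with bit 31 of the top limb, scale by 19 and add back. */
        c = (c << 1) | (t[7] >> 31);
        t[7] &= 0x7FFFFFFFu;
        c *= 19;
        for (i = 0; i < 8; i++) {
            c += t[i];
            r[i] = (uint32_t)c;
            c >>= 32;
        }
    }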
+ adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[3] * A[6] + ldr lr, [r1, #24] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[3] * A[7] + ldr lr, [r1, #28] + adc r6, r0, #0 + umlal r5, r6, r12, lr + # A[4] * A[5] + ldr r12, [r1, #16] + ldr lr, [r1, #20] + mov r11, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[4] * A[6] + ldr lr, [r1, #24] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[4] * A[7] + ldr lr, [r1, #28] + adc r7, r0, #0 + umlal r6, r7, r12, lr + # A[5] * A[6] + ldr r12, [r1, #20] + ldr lr, [r1, #24] + mov r11, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[5] * A[7] + ldr lr, [r1, #28] + adc r8, r0, #0 + umlal r7, r8, r12, lr + # A[6] * A[7] + ldr r12, [r1, #24] + ldr lr, [r1, #28] + mov r9, #0 + umlal r8, r9, r12, lr + add lr, sp, #32 + stm lr, {r3, r4, r5, r6, r7, r8, r9} + add lr, sp, #4 + ldm lr, {r4, r5, r6, r7, r8, r9, r10} + adds r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adcs r10, r10, r10 + stm lr!, {r4, r5, r6, r7, r8, r9, r10} + ldm lr, {r3, r4, r5, r6, r7, r8, r9} + adcs r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adc r10, r0, #0 + stm lr, {r3, r4, r5, r6, r7, r8, r9, r10} + add lr, sp, #4 + ldm lr, {r4, r5, r6, r7, r8, r9, r10} + mov lr, sp + # A[0] * A[0] + ldr r12, [r1] + umull r3, r11, r12, r12 + adds r4, r4, r11 + # A[1] * A[1] + ldr r12, [r1, #4] + adcs r5, r5, #0 + adc r11, r0, #0 + umlal r5, r11, r12, r12 + adds r6, r6, r11 + # A[2] * A[2] + ldr r12, [r1, #8] + adcs r7, r7, #0 + adc r11, r0, #0 + umlal r7, r11, r12, r12 + adds r8, r8, r11 + # A[3] * A[3] + ldr r12, [r1, #12] + adcs r9, r9, #0 + adc r11, r0, #0 + umlal r9, r11, r12, r12 + adds r10, r10, r11 + stm lr!, {r3, r4, r5, r6, r7, r8, r9, r10} + ldm lr, {r3, r4, r5, r6, r7, r8, r9, r10} + # A[4] * A[4] + ldr r12, [r1, #16] + adcs r3, r3, #0 + adc r11, r0, #0 + umlal r3, r11, r12, r12 + adds r4, r4, r11 + # A[5] * A[5] + ldr r12, [r1, #20] + adcs r5, r5, #0 + adc r11, r0, #0 + umlal r5, r11, r12, r12 + adds r6, r6, r11 + # A[6] * A[6] + ldr r12, [r1, #24] + adcs r7, r7, #0 + adc r11, r0, #0 + umlal r7, r11, r12, r12 + adds r8, r8, r11 + # A[7] * A[7] + ldr r12, [r1, #28] + adcs r9, r9, #0 + adc r10, r10, #0 + umlal r9, r10, r12, r12 + # Reduce + ldr r2, [sp, #28] + mov lr, sp + mov r12, #38 + umull r10, r11, r12, r10 + adds r10, r10, r2 + adc r11, r11, #0 + mov r12, #19 + lsl r11, r11, #1 + orr r11, r11, r10, LSR #31 + mul r11, r12, r11 + ldm lr!, {r1, r2} + mov r12, #38 + adds r1, r1, r11 + adc r11, r0, #0 + umlal r1, r11, r3, r12 + adds r2, r2, r11 + adc r11, r0, #0 + umlal r2, r11, r4, r12 + ldm lr!, {r3, r4} + adds r3, r3, r11 + adc r11, r0, #0 + umlal r3, r11, r5, r12 + adds r4, r4, r11 + adc r11, r0, #0 + umlal r4, r11, r6, r12 + ldm lr!, {r5, r6} + adds r5, r5, r11 + adc r11, r0, #0 + umlal r5, r11, r7, r12 + adds r6, r6, r11 + adc r11, r0, #0 + umlal r6, r11, r8, r12 + ldm lr!, {r7, r8} + adds r7, r7, r11 + adc r11, r0, #0 + umlal r7, r11, r9, r12 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r10, r10, #0x80000000 +#else + bfc r10, #31, #1 +#endif + adds r8, r10, r11 + # Reduce if top bit set + mov r12, #19 + and r11, r12, r8, ASR #31 + adds r1, r1, r11 + adcs r2, r2, #0 + adcs r3, r3, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r8, r8, #0x80000000 +#else + bfc r8, #31, #1 +#endif + adcs r7, r7, 
#0 + adc r8, r8, #0 + # Double + adds r1, r1, r1 + adcs r2, r2, r2 + adcs r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + adcs r7, r7, r7 + adc r8, r8, r8 + # Reduce if top bit set + mov r12, #19 + and r11, r12, r8, ASR #31 + adds r1, r1, r11 + adcs r2, r2, #0 + adcs r3, r3, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r8, r8, #0x80000000 +#else + bfc r8, #31, #1 +#endif + adcs r7, r7, #0 + adc r8, r8, #0 + # Store + ldr r0, [sp, #64] + stm r0, {r1, r2, r3, r4, r5, r6, r7, r8} + add sp, sp, #0x44 + pop {pc} + .size fe_sq2,.-fe_sq2 +#else .text .align 4 .globl fe_sq2 @@ -3535,7 +4673,7 @@ L_fe_invert8: fe_sq2: push {lr} sub sp, sp, #36 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r0, [sp, #28] str r1, [sp, #32] #else @@ -3557,7 +4695,7 @@ fe_sq2: umaal r9, r12, r1, r2 adcs r9, r9, r9 umaal r9, r11, lr, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [sp, #8] str r9, [sp, #12] #else @@ -3647,7 +4785,11 @@ fe_sq2: mov r12, r6 pop {r5, r6} umaal r5, lr, r8, r12 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r7, r7, #0x80000000 +#else bfc r7, #31, #1 +#endif umaal r6, lr, r9, r12 add r7, r7, lr # Reduce if top bit set @@ -3659,7 +4801,11 @@ fe_sq2: adcs r3, r3, #0 adcs r4, r4, #0 adcs r5, r5, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r7, r7, #0x80000000 +#else bfc r7, #31, #1 +#endif adcs r6, r6, #0 adc r7, r7, #0 # Double @@ -3680,7 +4826,11 @@ fe_sq2: adcs r3, r3, #0 adcs r4, r4, #0 adcs r5, r5, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r7, r7, #0x80000000 +#else bfc r7, #31, #1 +#endif adcs r6, r6, #0 adc r7, r7, #0 pop {r12, lr} @@ -3690,6 +4840,7 @@ fe_sq2: mov r1, lr pop {pc} .size fe_sq2,.-fe_sq2 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ .text .align 4 .globl fe_pow22523 @@ -4014,7 +5165,11 @@ ge_madd: adcs r7, r7, #0 adcs r8, r8, #0 adcs r9, r9, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r11, r11, #0x80000000 +#else bfc r11, #31, #1 +#endif adcs r10, r10, #0 adc r11, r11, #0 stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} @@ -4091,7 +5246,11 @@ ge_msub: adcs r7, r7, #0 adcs r8, r8, #0 adcs r9, r9, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r11, r11, #0x80000000 +#else bfc r11, #31, #1 +#endif adcs r10, r10, #0 adc r11, r11, #0 stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} @@ -4164,7 +5323,11 @@ ge_add: adcs r7, r7, #0 adcs r8, r8, #0 adcs r9, r9, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r11, r11, #0x80000000 +#else bfc r11, #31, #1 +#endif adcs r10, r10, #0 adc r11, r11, #0 stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} @@ -4242,7 +5405,11 @@ ge_sub: adcs r7, r7, #0 adcs r8, r8, #0 adcs r9, r9, #0 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r11, r11, #0x80000000 +#else bfc r11, #31, #1 +#endif adcs r10, r10, #0 adc r11, r11, #0 stm r0, {r4, r5, r6, r7, r8, r9, r10, r11} @@ -4259,37 +5426,52 @@ ge_sub: add sp, sp, #44 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size ge_sub,.-ge_sub +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) .text .align 4 .globl sc_reduce .type sc_reduce, %function sc_reduce: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #52 + sub sp, sp, #56 + str r0, [sp, #52] # Load bits 252-511 add r0, r0, #28 ldm r0, {r1, r2, r3, r4, r5, r6, r7, r8, r9} 
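The new `fe_sq2` above (2*a^2 mod p) finishes with the same branch-free clean-up used elsewhere in this file: whenever bit 255 of an intermediate may be set, 19 is added and the bit cleared, again relying on 2^255 being congruent to 19 mod 2^255 - 19. The mask comes from `and r11, r12, r8, ASR #31`, i.e. 19 AND the sign-extended top bit, so the adjustment is branch-free. A small C sketch of that step; the helper name is made up for illustration:

    #include <stdint.h>

    /* "Reduce if top bit set": mirrors the and/adds/adcs plus bic (or bfc)
     * sequence in fe_sq2 above. */
    static void reduce_top_bit(uint32_t t[8])
    {
        uint32_t mask = 0u - (t[7] >> 31);   /* all-ones iff bit 255 is set */
        uint64_t c = 19u & mask;
        int i;

        for (i = 0; i < 7; i++) {
            c += t[i];
            t[i] = (uint32_t)c;
            c >>= 32;
        }
        t[7] = (t[7] & 0x7FFFFFFFu) + (uint32_t)c;
    }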
lsr lr, r9, #24 lsl r9, r9, #4 - orr r9, r9, r8, lsr #28 + orr r9, r9, r8, LSR #28 lsl r8, r8, #4 - orr r8, r8, r7, lsr #28 + orr r8, r8, r7, LSR #28 lsl r7, r7, #4 - orr r7, r7, r6, lsr #28 + orr r7, r7, r6, LSR #28 lsl r6, r6, #4 - orr r6, r6, r5, lsr #28 + orr r6, r6, r5, LSR #28 lsl r5, r5, #4 - orr r5, r5, r4, lsr #28 + orr r5, r5, r4, LSR #28 lsl r4, r4, #4 - orr r4, r4, r3, lsr #28 + orr r4, r4, r3, LSR #28 lsl r3, r3, #4 - orr r3, r3, r2, lsr #28 + orr r3, r3, r2, LSR #28 lsl r2, r2, #4 - orr r2, r2, r1, lsr #28 + orr r2, r2, r1, LSR #28 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else bfc r9, #28, #4 +#endif sub r0, r0, #28 # Add order times bits 504..511 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0xa3 + lsl r10, r10, #8 + orr r10, r10, #10 + lsl r10, r10, #8 + orr r10, r10, #44 + lsl r10, r10, #8 + orr r10, r10, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r10, #0x2c lsl r10, r10, #8 add r10, r10, #0x13 @@ -4297,7 +5479,17 @@ sc_reduce: mov r10, #0x2c13 #endif movt r10, #0xa30a -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0xa7 + lsl r11, r11, #8 + orr r11, r11, #0xed + lsl r11, r11, #8 + orr r11, r11, #0x9c + lsl r11, r11, #8 + orr r11, r11, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r11, #0x9c lsl r11, r11, #8 add r11, r11, #0xe5 @@ -4305,10 +5497,23 @@ sc_reduce: mov r11, #0x9ce5 #endif movt r11, #0xa7ed +#endif mov r1, #0 umlal r2, r1, r10, lr - umaal r3, r1, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + adds r3, r3, r1 + mov r1, #0 + adc r1, r1, #0 + umlal r3, r1, r11, lr +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x5d + lsl r10, r10, #8 + orr r10, r10, #8 + lsl r10, r10, #8 + orr r10, r10, #0x63 + lsl r10, r10, #8 + orr r10, r10, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r10, #0x63 lsl r10, r10, #8 add r10, r10, #0x29 @@ -4316,7 +5521,17 @@ sc_reduce: mov r10, #0x6329 #endif movt r10, #0x5d08 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0xeb + lsl r11, r11, #8 + orr r11, r11, #33 + lsl r11, r11, #8 + orr r11, r11, #6 + lsl r11, r11, #8 + orr r11, r11, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r11, #0x6 lsl r11, r11, #8 add r11, r11, #0x21 @@ -4324,8 +5539,15 @@ sc_reduce: mov r11, #0x621 #endif movt r11, #0xeb21 - umaal r4, r1, r10, lr - umaal r5, r1, r11, lr +#endif + adds r4, r4, r1 + mov r1, #0 + adc r1, r1, #0 + umlal r4, r1, r10, lr + adds r5, r5, r1 + mov r1, #0 + adc r1, r1, #0 + umlal r5, r1, r11, lr adds r6, r6, r1 adcs r7, r7, #0 adcs r8, r8, #0 @@ -4335,7 +5557,17 @@ sc_reduce: sbcs r8, r8, #0 sbc r9, r9, #0 # Sub product of top 8 words and order -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r12, sp +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa3 + lsl r1, r1, #8 + orr r1, r1, #10 + lsl r1, r1, #8 + orr r1, r1, #44 + lsl r1, r1, #8 + orr r1, r1, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x2c lsl r1, r1, #8 add r1, r1, #0x13 @@ -4343,25 +5575,62 @@ sc_reduce: mov r1, #0x2c13 #endif movt r1, #0xa30a +#endif mov lr, #0 - ldm r0!, {r10, r11, r12} - umlal r10, lr, r2, r1 - umaal r11, lr, r3, r1 - umaal 
r12, lr, r4, r1 - stm sp!, {r10, r11, r12} - ldm r0!, {r10, r11, r12} - umaal r10, lr, r5, r1 - umaal r11, lr, r6, r1 - umaal r12, lr, r7, r1 - stm sp!, {r10, r11, r12} ldm r0!, {r10, r11} - umaal r10, lr, r8, r1 + umlal r10, lr, r2, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r4, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r6, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r8, r1 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r11, r11, #0xf0000000 +#else bfc r11, #28, #4 - umaal r11, lr, r9, r1 - stm sp!, {r10, r11, lr} +#endif + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} sub r0, r0, #16 - sub sp, sp, #32 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa7 + lsl r1, r1, #8 + orr r1, r1, #0xed + lsl r1, r1, #8 + orr r1, r1, #0x9c + lsl r1, r1, #8 + orr r1, r1, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x9c lsl r1, r1, #8 add r1, r1, #0xe5 @@ -4369,23 +5638,56 @@ sc_reduce: mov r1, #0x9ce5 #endif movt r1, #0xa7ed +#endif mov lr, #0 - ldm sp, {r10, r11, r12} + ldm r12, {r10, r11} umlal r10, lr, r2, r1 - umaal r11, lr, r3, r1 - umaal r12, lr, r4, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11, r12} - umaal r10, lr, r5, r1 - umaal r11, lr, r6, r1 - umaal r12, lr, r7, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11} - umaal r10, lr, r8, r1 - umaal r11, lr, r9, r1 - stm sp!, {r10, r11, lr} - sub sp, sp, #32 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r4, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r6, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r8, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x5d + lsl r1, r1, #8 + orr r1, r1, #8 + lsl r1, r1, #8 + orr r1, r1, #0x63 + lsl r1, r1, #8 + orr r1, r1, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x63 lsl r1, r1, #8 add r1, r1, #0x29 @@ -4393,23 +5695,56 @@ sc_reduce: mov r1, #0x6329 #endif movt r1, #0x5d08 +#endif mov lr, #0 - ldm sp, {r10, r11, r12} + ldm r12, {r10, r11} umlal r10, lr, r2, r1 - umaal r11, lr, r3, r1 - umaal r12, lr, r4, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11, r12} - umaal r10, lr, r5, r1 - umaal r11, lr, r6, r1 - umaal r12, lr, r7, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11} - umaal r10, lr, r8, r1 - umaal r11, lr, r9, r1 - stm sp!, {r10, r11, lr} - sub sp, sp, #32 -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r4, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r6, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r8, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xeb + lsl r1, r1, #8 + orr r1, r1, #33 + lsl r1, r1, #8 + orr r1, r1, #6 + lsl r1, r1, #8 + orr r1, r1, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x6 lsl r1, r1, #8 add r1, r1, #0x21 @@ -4417,48 +5752,83 @@ sc_reduce: mov r1, #0x621 #endif movt r1, #0xeb21 +#endif mov lr, #0 - ldm sp, {r10, r11, r12} + ldm r12, {r10, r11} umlal r10, lr, r2, r1 - umaal r11, lr, r3, r1 - umaal r12, lr, r4, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11, r12} - umaal r10, lr, r5, r1 - umaal r11, lr, r6, r1 - umaal r12, lr, r7, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11} - umaal r10, lr, r8, r1 - umaal r11, lr, r9, r1 - stm sp!, {r10, r11, lr} - sub sp, sp, #32 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r4, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r6, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r8, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r12, r12, #32 # Subtract at 4 * 32 - ldm sp, {r10, r11, r12} + ldm r12, {r10, r11} subs r10, r10, r2 sbcs r11, r11, r3 - sbcs r12, r12, r4 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11, r12} - sbcs r10, r10, r5 - sbcs r11, r11, r6 - sbcs r12, r12, r7 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11} + stm r12!, {r10, r11} + ldm r12, {r10, r11} + sbcs r10, r10, r4 + sbcs r11, r11, r5 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + sbcs r10, r10, r6 + sbcs r11, r11, r7 + stm r12!, {r10, r11} + ldm r12, {r10, r11} sbcs r10, r10, r8 sbc r11, r11, r9 - stm sp!, {r10, r11} - sub sp, sp, #36 + stm r12!, {r10, r11} + sub r12, r12, #36 asr lr, r11, #25 # Conditionally subtract order starting at bit 125 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0xa00000 lsl r1, r1, #8 add r1, r1, #0x0 #else mov r1, #0xa0000000 #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r2, #0x4b + lsl r2, r2, #8 + orr r2, r2, #0x9e + lsl r2, r2, #8 + orr r2, r2, #0xba + lsl r2, r2, #8 + orr r2, r2, #0x7d +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r2, #0xba lsl r2, r2, #8 add r2, r2, #0x7d @@ -4466,7 +5836,17 @@ sc_reduce: mov r2, #0xba7d #endif movt r2, #0x4b9e -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r3, #0xcb + lsl r3, r3, #8 + orr r3, r3, #2 + lsl r3, r3, #8 + orr r3, r3, #0x4c + lsl r3, r3, #8 + orr r3, r3, #0x63 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x4c lsl r3, r3, #8 add r3, r3, #0x63 @@ -4474,7 +5854,17 @@ sc_reduce: mov r3, #0x4c63 #endif movt r3, #0xcb02 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r4, #0xd4 + lsl r4, r4, #8 + orr r4, r4, #0x5e + lsl r4, r4, #8 + orr r4, r4, #0xf3 + lsl r4, r4, #8 + orr r4, r4, #0x9a +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r4, #0xf3 lsl r4, r4, #8 add r4, r4, #0x9a @@ -4482,7 +5872,17 @@ sc_reduce: mov r4, #0xf39a #endif movt r4, #0xd45e -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r5, #2 + lsl r5, r5, #8 + orr r5, r5, #0x9b + lsl r5, r5, #8 + orr r5, r5, #0xdf + lsl r5, r5, #8 + orr r5, r5, #59 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r5, #0xdf lsl r5, r5, #8 add r5, r5, #0x3b @@ -4490,7 +5890,8 @@ sc_reduce: mov r5, #0xdf3b #endif movt r5, #0x29b -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r9, #0x20000 lsl r9, r9, #8 add r9, r9, #0x0 @@ -4503,26 +5904,30 @@ sc_reduce: and r4, r4, lr and r5, r5, lr and r9, r9, lr - ldm sp, {r10, r11, r12} + ldm r12, {r10, r11} adds r10, r10, r1 adcs r11, r11, r2 - adcs r12, r12, r3 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11, r12} - adcs r10, r10, r4 - adcs r11, r11, r5 - adcs r12, r12, #0 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11, r12} + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adcs r10, r10, r3 + adcs r11, r11, r4 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adcs r10, r10, r5 + adcs r11, r11, #0 + stm r12!, {r10, r11} + ldm r12, {r10, r11} adcs r10, r10, #0 adcs r11, r11, #0 - adcs r12, r12, r9 - stm sp!, {r10, r11, r12} - sub sp, sp, #48 + stm r12!, {r10, r11} + ldm r12, {r10} + adcs r10, r10, #0 + stm r12!, {r10} sub r0, r0, #16 + mov r12, sp # Load bits 252-376 - add sp, sp, #28 - ldm sp, {r1, r2, r3, r4, r5} + add r12, r12, #28 + ldm r12, {r1, r2, r3, r4, r5} lsl r5, r5, #4 orr r5, r5, r4, lsr #28 lsl r4, r4, #4 @@ -4531,11 +5936,25 @@ sc_reduce: orr r3, r3, r2, lsr #28 lsl r2, r2, #4 orr r2, r2, r1, lsr #28 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r5, r5, #0xe0000000 +#else bfc r5, #29, #3 - sub sp, sp, #28 - # Sub product of top 8 words and order +#endif + sub r12, r12, #28 + # Sub product of top 4 words and order + mov r0, sp # * -5cf5d3ed -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa3 + lsl r1, r1, #8 + orr r1, r1, #10 + lsl r1, r1, #8 + orr r1, r1, #44 + lsl r1, r1, #8 + orr r1, r1, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x2c lsl r1, r1, #8 add r1, r1, #0x13 @@ -4543,16 +5962,35 @@ sc_reduce: mov r1, #0x2c13 #endif movt r1, #0xa30a +#endif mov lr, #0 - ldm sp, {r6, r7, r8, r9} + ldm r0, {r6, r7, r8, r9} umlal r6, lr, r2, r1 - umaal r7, lr, r3, r1 - umaal r8, lr, r4, r1 - umaal r9, lr, r5, r1 - stm sp, {r6, r7, r8, r9} - add sp, sp, #4 + adds r7, r7, lr + mov lr, #0 + adc lr, lr, #0 + umlal r7, lr, r3, r1 + adds r8, r8, lr + mov lr, #0 + adc lr, lr, #0 + umlal r8, lr, r4, r1 + adds r9, r9, lr + 
mov lr, #0 + adc lr, lr, #0 + umlal r9, lr, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 # * -5812631b -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa7 + lsl r1, r1, #8 + orr r1, r1, #0xed + lsl r1, r1, #8 + orr r1, r1, #0x9c + lsl r1, r1, #8 + orr r1, r1, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x9c lsl r1, r1, #8 add r1, r1, #0xe5 @@ -4560,16 +5998,35 @@ sc_reduce: mov r1, #0x9ce5 #endif movt r1, #0xa7ed +#endif mov r10, #0 - ldm sp, {r6, r7, r8, r9} + ldm r0, {r6, r7, r8, r9} umlal r6, r10, r2, r1 - umaal r7, r10, r3, r1 - umaal r8, r10, r4, r1 - umaal r9, r10, r5, r1 - stm sp, {r6, r7, r8, r9} - add sp, sp, #4 + adds r7, r7, r10 + mov r10, #0 + adc r10, r10, #0 + umlal r7, r10, r3, r1 + adds r8, r8, r10 + mov r10, #0 + adc r10, r10, #0 + umlal r8, r10, r4, r1 + adds r9, r9, r10 + mov r10, #0 + adc r10, r10, #0 + umlal r9, r10, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 # * -a2f79cd7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x5d + lsl r1, r1, #8 + orr r1, r1, #8 + lsl r1, r1, #8 + orr r1, r1, #0x63 + lsl r1, r1, #8 + orr r1, r1, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x63 lsl r1, r1, #8 add r1, r1, #0x29 @@ -4577,16 +6034,35 @@ sc_reduce: mov r1, #0x6329 #endif movt r1, #0x5d08 +#endif mov r11, #0 - ldm sp, {r6, r7, r8, r9} + ldm r0, {r6, r7, r8, r9} umlal r6, r11, r2, r1 - umaal r7, r11, r3, r1 - umaal r8, r11, r4, r1 - umaal r9, r11, r5, r1 - stm sp, {r6, r7, r8, r9} - add sp, sp, #4 + adds r7, r7, r11 + mov r11, #0 + adc r11, r11, #0 + umlal r7, r11, r3, r1 + adds r8, r8, r11 + mov r11, #0 + adc r11, r11, #0 + umlal r8, r11, r4, r1 + adds r9, r9, r11 + mov r11, #0 + adc r11, r11, #0 + umlal r9, r11, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 # * -14def9df -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xeb + lsl r1, r1, #8 + orr r1, r1, #33 + lsl r1, r1, #8 + orr r1, r1, #6 + lsl r1, r1, #8 + orr r1, r1, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x6 lsl r1, r1, #8 add r1, r1, #0x21 @@ -4594,17 +6070,31 @@ sc_reduce: mov r1, #0x621 #endif movt r1, #0xeb21 +#endif mov r12, #0 - ldm sp, {r6, r7, r8, r9} + ldm r0, {r6, r7, r8, r9} umlal r6, r12, r2, r1 - umaal r7, r12, r3, r1 - umaal r8, r12, r4, r1 - umaal r9, r12, r5, r1 - stm sp, {r6, r7, r8, r9} - add sp, sp, #4 + adds r7, r7, r12 + mov r12, #0 + adc r12, r12, #0 + umlal r7, r12, r3, r1 + adds r8, r8, r12 + mov r12, #0 + adc r12, r12, #0 + umlal r8, r12, r4, r1 + adds r9, r9, r12 + mov r12, #0 + adc r12, r12, #0 + umlal r9, r12, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 # Add overflows at 4 * 32 - ldm sp, {r6, r7, r8, r9} + ldm r0, {r6, r7, r8, r9} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else bfc r9, #28, #4 +#endif adds r6, r6, lr adcs r7, r7, r10 adcs r8, r8, r11 @@ -4615,9 +6105,18 @@ sc_reduce: sbcs r8, r8, r4 sbcs r9, r9, r5 sbc r1, r1, r1 - sub sp, sp, #16 - ldm sp, {r2, r3, r4, r5} -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + sub r0, r0, #16 + ldm r0, {r2, r3, r4, r5} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x5c + lsl r10, r10, #8 + orr r10, r10, #0xf5 + lsl r10, r10, #8 + orr r10, r10, #0xd3 + lsl r10, r10, #8 + orr r10, r10, #0xed 
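Most of the `sc_reduce` changes above are the mechanical removal of `umaal`, which first appeared in ARMv6: each `umaal lo, hi, a, b` (hi:lo = a*b + lo + hi) is replaced by an `adds`/`adc` pair that folds the running carry into the low word, followed by a plain `umlal`. The two forms are arithmetically identical; the C sketch below spells out the equivalence, with both helper names invented for illustration:

    #include <stdint.h>

    /* Semantics of "umaal lo, hi, a, b" (ARMv6+): the sum a*b + lo + hi
     * always fits in 64 bits, so no carry can be lost. */
    static inline void umaal_ref(uint32_t *lo, uint32_t *hi, uint32_t a, uint32_t b)
    {
        uint64_t t = (uint64_t)a * b + *lo + *hi;
        *lo = (uint32_t)t;
        *hi = (uint32_t)(t >> 32);
    }

    /* Shape of the pre-ARMv6 replacement: fold the old carry into lo with
     * adds/adc, then do a umlal-style multiply-accumulate on the pair. */
    static inline void umaal_pre_v6(uint32_t *lo, uint32_t *hi, uint32_t a, uint32_t b)
    {
        uint64_t acc = (uint64_t)*lo + *hi;            /* adds lo, lo, hi  */
        uint32_t lo2 = (uint32_t)acc;
        uint32_t c   = (uint32_t)(acc >> 32);          /* adc  c, #0       */
        uint64_t t   = (uint64_t)a * b + (((uint64_t)c << 32) | lo2); /* umlal */
        *lo = (uint32_t)t;
        *hi = (uint32_t)(t >> 32);
    }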
+#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r10, #0xd3 lsl r10, r10, #8 add r10, r10, #0xed @@ -4625,7 +6124,17 @@ sc_reduce: mov r10, #0xd3ed #endif movt r10, #0x5cf5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0x58 + lsl r11, r11, #8 + orr r11, r11, #18 + lsl r11, r11, #8 + orr r11, r11, #0x63 + lsl r11, r11, #8 + orr r11, r11, #26 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r11, #0x63 lsl r11, r11, #8 add r11, r11, #0x1a @@ -4633,7 +6142,17 @@ sc_reduce: mov r11, #0x631a #endif movt r11, #0x5812 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r12, #0xa2 + lsl r12, r12, #8 + orr r12, r12, #0xf7 + lsl r12, r12, #8 + orr r12, r12, #0x9c + lsl r12, r12, #8 + orr r12, r12, #0xd6 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r12, #0x9c lsl r12, r12, #8 add r12, r12, #0xd6 @@ -4641,7 +6160,17 @@ sc_reduce: mov r12, #0x9cd6 #endif movt r12, #0xa2f7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov lr, #20 + lsl lr, lr, #8 + orr lr, lr, #0xde + lsl lr, lr, #8 + orr lr, lr, #0xf9 + lsl lr, lr, #8 + orr lr, lr, #0xde +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov lr, #0xf9 lsl lr, lr, #8 add lr, lr, #0xde @@ -4649,6 +6178,7 @@ sc_reduce: mov lr, #0xf9de #endif movt lr, #0x14de +#endif and r10, r10, r1 and r11, r11, r1 and r12, r12, r1 @@ -4662,20 +6192,1809 @@ sc_reduce: and r1, r1, #0x10000000 adcs r8, r8, #0 adc r9, r9, r1 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else bfc r9, #28, #4 +#endif # Store result + ldr r0, [sp, #52] stm r0, {r2, r3, r4, r5, r6, r7, r8, r9} - add sp, sp, #52 + add sp, sp, #56 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size sc_reduce,.-sc_reduce +#else + .text + .align 4 + .globl sc_reduce + .type sc_reduce, %function +sc_reduce: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #56 + str r0, [sp, #52] + # Load bits 252-511 + add r0, r0, #28 + ldm r0, {r1, r2, r3, r4, r5, r6, r7, r8, r9} + lsr lr, r9, #24 + lsl r9, r9, #4 + orr r9, r9, r8, LSR #28 + lsl r8, r8, #4 + orr r8, r8, r7, LSR #28 + lsl r7, r7, #4 + orr r7, r7, r6, LSR #28 + lsl r6, r6, #4 + orr r6, r6, r5, LSR #28 + lsl r5, r5, #4 + orr r5, r5, r4, LSR #28 + lsl r4, r4, #4 + orr r4, r4, r3, LSR #28 + lsl r3, r3, #4 + orr r3, r3, r2, LSR #28 + lsl r2, r2, #4 + orr r2, r2, r1, LSR #28 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else + bfc r9, #28, #4 +#endif + sub r0, r0, #28 + # Add order times bits 504..511 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0xa3 + lsl r10, r10, #8 + orr r10, r10, #10 + lsl r10, r10, #8 + orr r10, r10, #44 + lsl r10, r10, #8 + orr r10, r10, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x2c + lsl r10, r10, #8 + add r10, r10, #0x13 +#else + mov r10, #0x2c13 +#endif + movt r10, #0xa30a +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0xa7 + lsl r11, r11, #8 + orr r11, r11, #0xed + lsl r11, r11, #8 + orr r11, r11, #0x9c + lsl r11, r11, #8 + orr r11, r11, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0x9c + lsl r11, r11, #8 + add r11, r11, #0xe5 +#else + mov r11, #0x9ce5 +#endif + movt r11, #0xa7ed +#endif + mov r1, #0 + 
umlal r2, r1, r10, lr + umaal r3, r1, r11, lr +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x5d + lsl r10, r10, #8 + orr r10, r10, #8 + lsl r10, r10, #8 + orr r10, r10, #0x63 + lsl r10, r10, #8 + orr r10, r10, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x63 + lsl r10, r10, #8 + add r10, r10, #0x29 +#else + mov r10, #0x6329 +#endif + movt r10, #0x5d08 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0xeb + lsl r11, r11, #8 + orr r11, r11, #33 + lsl r11, r11, #8 + orr r11, r11, #6 + lsl r11, r11, #8 + orr r11, r11, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0x6 + lsl r11, r11, #8 + add r11, r11, #0x21 +#else + mov r11, #0x621 +#endif + movt r11, #0xeb21 +#endif + umaal r4, r1, r10, lr + umaal r5, r1, r11, lr + adds r6, r6, r1 + adcs r7, r7, #0 + adcs r8, r8, #0 + adc r9, r9, #0 + subs r6, r6, lr + sbcs r7, r7, #0 + sbcs r8, r8, #0 + sbc r9, r9, #0 + # Sub product of top 8 words and order + mov r12, sp +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa3 + lsl r1, r1, #8 + orr r1, r1, #10 + lsl r1, r1, #8 + orr r1, r1, #44 + lsl r1, r1, #8 + orr r1, r1, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x2c + lsl r1, r1, #8 + add r1, r1, #0x13 +#else + mov r1, #0x2c13 +#endif + movt r1, #0xa30a +#endif + mov lr, #0 + ldm r0!, {r10, r11} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + umaal r10, lr, r4, r1 + umaal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + umaal r10, lr, r6, r1 + umaal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + umaal r10, lr, r8, r1 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r11, r11, #0xf0000000 +#else + bfc r11, #28, #4 +#endif + umaal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r0, r0, #16 + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa7 + lsl r1, r1, #8 + orr r1, r1, #0xed + lsl r1, r1, #8 + orr r1, r1, #0x9c + lsl r1, r1, #8 + orr r1, r1, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x9c + lsl r1, r1, #8 + add r1, r1, #0xe5 +#else + mov r1, #0x9ce5 +#endif + movt r1, #0xa7ed +#endif + mov lr, #0 + ldm r12, {r10, r11} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r4, r1 + umaal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r6, r1 + umaal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r8, r1 + umaal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x5d + lsl r1, r1, #8 + orr r1, r1, #8 + lsl r1, r1, #8 + orr r1, r1, #0x63 + lsl r1, r1, #8 + orr r1, r1, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x63 + lsl r1, r1, #8 + add r1, r1, #0x29 +#else + mov r1, #0x6329 +#endif + movt r1, #0x5d08 +#endif + mov lr, #0 + ldm r12, {r10, r11} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r4, r1 + umaal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r6, r1 + umaal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r8, r1 + umaal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xeb + lsl 
r1, r1, #8 + orr r1, r1, #33 + lsl r1, r1, #8 + orr r1, r1, #6 + lsl r1, r1, #8 + orr r1, r1, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x6 + lsl r1, r1, #8 + add r1, r1, #0x21 +#else + mov r1, #0x621 +#endif + movt r1, #0xeb21 +#endif + mov lr, #0 + ldm r12, {r10, r11} + umlal r10, lr, r2, r1 + umaal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r4, r1 + umaal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r6, r1 + umaal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r8, r1 + umaal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r12, r12, #32 + # Subtract at 4 * 32 + ldm r12, {r10, r11} + subs r10, r10, r2 + sbcs r11, r11, r3 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + sbcs r10, r10, r4 + sbcs r11, r11, r5 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + sbcs r10, r10, r6 + sbcs r11, r11, r7 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + sbcs r10, r10, r8 + sbc r11, r11, r9 + stm r12!, {r10, r11} + sub r12, r12, #36 + asr lr, r11, #25 + # Conditionally subtract order starting at bit 125 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa00000 + lsl r1, r1, #8 + add r1, r1, #0x0 +#else + mov r1, #0xa0000000 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r2, #0x4b + lsl r2, r2, #8 + orr r2, r2, #0x9e + lsl r2, r2, #8 + orr r2, r2, #0xba + lsl r2, r2, #8 + orr r2, r2, #0x7d +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r2, #0xba + lsl r2, r2, #8 + add r2, r2, #0x7d +#else + mov r2, #0xba7d +#endif + movt r2, #0x4b9e +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r3, #0xcb + lsl r3, r3, #8 + orr r3, r3, #2 + lsl r3, r3, #8 + orr r3, r3, #0x4c + lsl r3, r3, #8 + orr r3, r3, #0x63 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r3, #0x4c + lsl r3, r3, #8 + add r3, r3, #0x63 +#else + mov r3, #0x4c63 +#endif + movt r3, #0xcb02 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r4, #0xd4 + lsl r4, r4, #8 + orr r4, r4, #0x5e + lsl r4, r4, #8 + orr r4, r4, #0xf3 + lsl r4, r4, #8 + orr r4, r4, #0x9a +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r4, #0xf3 + lsl r4, r4, #8 + add r4, r4, #0x9a +#else + mov r4, #0xf39a +#endif + movt r4, #0xd45e +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r5, #2 + lsl r5, r5, #8 + orr r5, r5, #0x9b + lsl r5, r5, #8 + orr r5, r5, #0xdf + lsl r5, r5, #8 + orr r5, r5, #59 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r5, #0xdf + lsl r5, r5, #8 + add r5, r5, #0x3b +#else + mov r5, #0xdf3b +#endif + movt r5, #0x29b +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r9, #0x20000 + lsl r9, r9, #8 + add r9, r9, #0x0 +#else + mov r9, #0x2000000 +#endif + and r1, r1, lr + and r2, r2, lr + and r3, r3, lr + and r4, r4, lr + and r5, r5, lr + and r9, r9, lr + ldm r12, {r10, r11} + adds r10, r10, r1 + adcs r11, r11, r2 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adcs r10, r10, r3 + adcs r11, r11, r4 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adcs r10, r10, r5 + adcs r11, r11, #0 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adcs r10, r10, #0 + adcs r11, r11, #0 + stm r12!, {r10, r11} + ldm r12, {r10} + adcs r10, r10, #0 + stm r12!, {r10} + sub r0, r0, #16 + mov r12, sp + # Load bits 252-376 + add r12, r12, #28 + ldm r12, {r1, r2, r3, r4, r5} + lsl r5, r5, #4 + orr r5, r5, r4, lsr #28 + lsl r4, r4, #4 + orr r4, r4, r3, lsr 
#28 + lsl r3, r3, #4 + orr r3, r3, r2, lsr #28 + lsl r2, r2, #4 + orr r2, r2, r1, lsr #28 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r5, r5, #0xe0000000 +#else + bfc r5, #29, #3 +#endif + sub r12, r12, #28 + # Sub product of top 4 words and order + mov r0, sp + # * -5cf5d3ed +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa3 + lsl r1, r1, #8 + orr r1, r1, #10 + lsl r1, r1, #8 + orr r1, r1, #44 + lsl r1, r1, #8 + orr r1, r1, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x2c + lsl r1, r1, #8 + add r1, r1, #0x13 +#else + mov r1, #0x2c13 +#endif + movt r1, #0xa30a +#endif + mov lr, #0 + ldm r0, {r6, r7, r8, r9} + umlal r6, lr, r2, r1 + umaal r7, lr, r3, r1 + umaal r8, lr, r4, r1 + umaal r9, lr, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 + # * -5812631b +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa7 + lsl r1, r1, #8 + orr r1, r1, #0xed + lsl r1, r1, #8 + orr r1, r1, #0x9c + lsl r1, r1, #8 + orr r1, r1, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x9c + lsl r1, r1, #8 + add r1, r1, #0xe5 +#else + mov r1, #0x9ce5 +#endif + movt r1, #0xa7ed +#endif + mov r10, #0 + ldm r0, {r6, r7, r8, r9} + umlal r6, r10, r2, r1 + umaal r7, r10, r3, r1 + umaal r8, r10, r4, r1 + umaal r9, r10, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 + # * -a2f79cd7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x5d + lsl r1, r1, #8 + orr r1, r1, #8 + lsl r1, r1, #8 + orr r1, r1, #0x63 + lsl r1, r1, #8 + orr r1, r1, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x63 + lsl r1, r1, #8 + add r1, r1, #0x29 +#else + mov r1, #0x6329 +#endif + movt r1, #0x5d08 +#endif + mov r11, #0 + ldm r0, {r6, r7, r8, r9} + umlal r6, r11, r2, r1 + umaal r7, r11, r3, r1 + umaal r8, r11, r4, r1 + umaal r9, r11, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 + # * -14def9df +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xeb + lsl r1, r1, #8 + orr r1, r1, #33 + lsl r1, r1, #8 + orr r1, r1, #6 + lsl r1, r1, #8 + orr r1, r1, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x6 + lsl r1, r1, #8 + add r1, r1, #0x21 +#else + mov r1, #0x621 +#endif + movt r1, #0xeb21 +#endif + mov r12, #0 + ldm r0, {r6, r7, r8, r9} + umlal r6, r12, r2, r1 + umaal r7, r12, r3, r1 + umaal r8, r12, r4, r1 + umaal r9, r12, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 + # Add overflows at 4 * 32 + ldm r0, {r6, r7, r8, r9} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else + bfc r9, #28, #4 +#endif + adds r6, r6, lr + adcs r7, r7, r10 + adcs r8, r8, r11 + adc r9, r9, r12 + # Subtract top at 4 * 32 + subs r6, r6, r2 + sbcs r7, r7, r3 + sbcs r8, r8, r4 + sbcs r9, r9, r5 + sbc r1, r1, r1 + sub r0, r0, #16 + ldm r0, {r2, r3, r4, r5} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x5c + lsl r10, r10, #8 + orr r10, r10, #0xf5 + lsl r10, r10, #8 + orr r10, r10, #0xd3 + lsl r10, r10, #8 + orr r10, r10, #0xed +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0xd3 + lsl r10, r10, #8 + add r10, r10, #0xed +#else + mov r10, #0xd3ed +#endif + movt r10, #0x5cf5 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0x58 + lsl r11, r11, #8 + orr r11, r11, #18 + lsl r11, r11, #8 + orr r11, r11, #0x63 + lsl r11, r11, #8 + orr r11, r11, #26 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0x63 + lsl 
r11, r11, #8 + add r11, r11, #0x1a +#else + mov r11, #0x631a +#endif + movt r11, #0x5812 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r12, #0xa2 + lsl r12, r12, #8 + orr r12, r12, #0xf7 + lsl r12, r12, #8 + orr r12, r12, #0x9c + lsl r12, r12, #8 + orr r12, r12, #0xd6 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r12, #0x9c + lsl r12, r12, #8 + add r12, r12, #0xd6 +#else + mov r12, #0x9cd6 +#endif + movt r12, #0xa2f7 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov lr, #20 + lsl lr, lr, #8 + orr lr, lr, #0xde + lsl lr, lr, #8 + orr lr, lr, #0xf9 + lsl lr, lr, #8 + orr lr, lr, #0xde +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov lr, #0xf9 + lsl lr, lr, #8 + add lr, lr, #0xde +#else + mov lr, #0xf9de +#endif + movt lr, #0x14de +#endif + and r10, r10, r1 + and r11, r11, r1 + and r12, r12, r1 + and lr, lr, r1 + adds r2, r2, r10 + adcs r3, r3, r11 + adcs r4, r4, r12 + adcs r5, r5, lr + adcs r6, r6, #0 + adcs r7, r7, #0 + and r1, r1, #0x10000000 + adcs r8, r8, #0 + adc r9, r9, r1 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else + bfc r9, #28, #4 +#endif + # Store result + ldr r0, [sp, #52] + stm r0, {r2, r3, r4, r5, r6, r7, r8, r9} + add sp, sp, #56 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size sc_reduce,.-sc_reduce +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ #ifdef HAVE_ED25519_SIGN +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) .text .align 4 .globl sc_muladd .type sc_muladd, %function sc_muladd: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} - sub sp, sp, #0x70 + sub sp, sp, #0x50 + add lr, sp, #0x44 + stm lr, {r0, r1, r3} + mov r0, #0 + ldr r12, [r1] + # A[0] * B[0] + ldr lr, [r2] + umull r3, r4, r12, lr + # A[0] * B[2] + ldr lr, [r2, #8] + umull r5, r6, r12, lr + # A[0] * B[4] + ldr lr, [r2, #16] + umull r7, r8, r12, lr + # A[0] * B[6] + ldr lr, [r2, #24] + umull r9, r10, r12, lr + str r3, [sp] + # A[0] * B[1] + ldr lr, [r2, #4] + mov r11, r0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[0] * B[3] + ldr lr, [r2, #12] + adcs r6, r6, #0 + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[0] * B[5] + ldr lr, [r2, #20] + adcs r8, r8, #0 + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[0] * B[7] + ldr lr, [r2, #28] + adcs r10, r10, #0 + adc r3, r0, #0 + umlal r10, r3, r12, lr + # A[1] * B[0] + ldr r12, [r1, #4] + ldr lr, [r2] + mov r11, #0 + umlal r4, r11, r12, lr + str r4, [sp, #4] + adds r5, r5, r11 + # A[1] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[1] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[1] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[1] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[1] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[1] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[1] * B[7] + ldr lr, [r2, #28] + adc r4, r0, #0 + umlal r3, r4, r12, lr + # A[2] * B[0] + ldr r12, [r1, #8] + ldr lr, [r2] + mov r11, #0 + umlal r5, r11, r12, lr + str r5, [sp, #8] + adds r6, r6, r11 + # A[2] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[2] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[2] * B[3] + ldr lr, [r2, 
#12] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[2] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[2] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[2] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[2] * B[7] + ldr lr, [r2, #28] + adc r5, r0, #0 + umlal r4, r5, r12, lr + # A[3] * B[0] + ldr r12, [r1, #12] + ldr lr, [r2] + mov r11, #0 + umlal r6, r11, r12, lr + str r6, [sp, #12] + adds r7, r7, r11 + # A[3] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[3] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[3] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[3] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[3] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[3] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[3] * B[7] + ldr lr, [r2, #28] + adc r6, r0, #0 + umlal r5, r6, r12, lr + # A[4] * B[0] + ldr r12, [r1, #16] + ldr lr, [r2] + mov r11, #0 + umlal r7, r11, r12, lr + str r7, [sp, #16] + adds r8, r8, r11 + # A[4] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[4] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[4] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[4] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[4] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[4] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[4] * B[7] + ldr lr, [r2, #28] + adc r7, r0, #0 + umlal r6, r7, r12, lr + # A[5] * B[0] + ldr r12, [r1, #20] + ldr lr, [r2] + mov r11, #0 + umlal r8, r11, r12, lr + str r8, [sp, #20] + adds r9, r9, r11 + # A[5] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r9, r11, r12, lr + adds r10, r10, r11 + # A[5] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[5] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[5] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[5] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[5] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[5] * B[7] + ldr lr, [r2, #28] + adc r8, r0, #0 + umlal r7, r8, r12, lr + # A[6] * B[0] + ldr r12, [r1, #24] + ldr lr, [r2] + mov r11, #0 + umlal r9, r11, r12, lr + str r9, [sp, #24] + adds r10, r10, r11 + # A[6] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r10, r11, r12, lr + adds r3, r3, r11 + # A[6] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[6] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[6] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[6] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[6] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # 
A[6] * B[7] + ldr lr, [r2, #28] + adc r9, r0, #0 + umlal r8, r9, r12, lr + # A[7] * B[0] + ldr r12, [r1, #28] + ldr lr, [r2] + mov r11, #0 + umlal r10, r11, r12, lr + str r10, [sp, #28] + adds r3, r3, r11 + # A[7] * B[1] + ldr lr, [r2, #4] + adc r11, r0, #0 + umlal r3, r11, r12, lr + adds r4, r4, r11 + # A[7] * B[2] + ldr lr, [r2, #8] + adc r11, r0, #0 + umlal r4, r11, r12, lr + adds r5, r5, r11 + # A[7] * B[3] + ldr lr, [r2, #12] + adc r11, r0, #0 + umlal r5, r11, r12, lr + adds r6, r6, r11 + # A[7] * B[4] + ldr lr, [r2, #16] + adc r11, r0, #0 + umlal r6, r11, r12, lr + adds r7, r7, r11 + # A[7] * B[5] + ldr lr, [r2, #20] + adc r11, r0, #0 + umlal r7, r11, r12, lr + adds r8, r8, r11 + # A[7] * B[6] + ldr lr, [r2, #24] + adc r11, r0, #0 + umlal r8, r11, r12, lr + adds r9, r9, r11 + # A[7] * B[7] + ldr lr, [r2, #28] + adc r10, r0, #0 + umlal r9, r10, r12, lr + add lr, sp, #32 + stm lr, {r3, r4, r5, r6, r7, r8, r9, r10} + mov r0, sp + # Add c to a * b + ldr lr, [sp, #76] + ldm r0, {r2, r3, r4, r5, r6, r7, r8, r9} + ldm lr!, {r1, r10, r11, r12} + adds r2, r2, r1 + adcs r3, r3, r10 + adcs r4, r4, r11 + adcs r5, r5, r12 + ldm lr!, {r1, r10, r11, r12} + adcs r6, r6, r1 + adcs r7, r7, r10 + adcs r8, r8, r11 + adcs r9, r9, r12 + mov r1, r9 + stm r0!, {r2, r3, r4, r5, r6, r7, r8, r9} + ldm r0, {r2, r3, r4, r5, r6, r7, r8, r9} + adcs r2, r2, #0 + adcs r3, r3, #0 + adcs r4, r4, #0 + adcs r5, r5, #0 + adcs r6, r6, #0 + adcs r7, r7, #0 + adcs r8, r8, #0 + adc r9, r9, #0 + sub r0, r0, #32 + # Get 252..503 and 504..507 + lsr lr, r9, #24 + lsl r9, r9, #4 + orr r9, r9, r8, LSR #28 + lsl r8, r8, #4 + orr r8, r8, r7, LSR #28 + lsl r7, r7, #4 + orr r7, r7, r6, LSR #28 + lsl r6, r6, #4 + orr r6, r6, r5, LSR #28 + lsl r5, r5, #4 + orr r5, r5, r4, LSR #28 + lsl r4, r4, #4 + orr r4, r4, r3, LSR #28 + lsl r3, r3, #4 + orr r3, r3, r2, LSR #28 + lsl r2, r2, #4 + orr r2, r2, r1, LSR #28 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else + bfc r9, #28, #4 +#endif + # Add order times bits 504..507 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0xa3 + lsl r10, r10, #8 + orr r10, r10, #10 + lsl r10, r10, #8 + orr r10, r10, #44 + lsl r10, r10, #8 + orr r10, r10, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x2c + lsl r10, r10, #8 + add r10, r10, #0x13 +#else + mov r10, #0x2c13 +#endif + movt r10, #0xa30a +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0xa7 + lsl r11, r11, #8 + orr r11, r11, #0xed + lsl r11, r11, #8 + orr r11, r11, #0x9c + lsl r11, r11, #8 + orr r11, r11, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0x9c + lsl r11, r11, #8 + add r11, r11, #0xe5 +#else + mov r11, #0x9ce5 +#endif + movt r11, #0xa7ed +#endif + mov r1, #0 + umlal r2, r1, r10, lr + adds r3, r3, r1 + mov r1, #0 + adc r1, r1, #0 + umlal r3, r1, r11, lr +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x5d + lsl r10, r10, #8 + orr r10, r10, #8 + lsl r10, r10, #8 + orr r10, r10, #0x63 + lsl r10, r10, #8 + orr r10, r10, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x63 + lsl r10, r10, #8 + add r10, r10, #0x29 +#else + mov r10, #0x6329 +#endif + movt r10, #0x5d08 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0xeb + lsl r11, r11, #8 + orr r11, r11, #33 + lsl r11, r11, #8 + orr r11, r11, #6 + lsl r11, r11, #8 + orr r11, r11, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0x6 + 
lsl r11, r11, #8 + add r11, r11, #0x21 +#else + mov r11, #0x621 +#endif + movt r11, #0xeb21 +#endif + adds r4, r4, r1 + mov r1, #0 + adc r1, r1, #0 + umlal r4, r1, r10, lr + adds r5, r5, r1 + mov r1, #0 + adc r1, r1, #0 + umlal r5, r1, r11, lr + adds r6, r6, r1 + adcs r7, r7, #0 + adcs r8, r8, #0 + adc r9, r9, #0 + subs r6, r6, lr + sbcs r7, r7, #0 + sbcs r8, r8, #0 + sbc r9, r9, #0 + # Sub product of top 8 words and order + mov r12, sp +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa3 + lsl r1, r1, #8 + orr r1, r1, #10 + lsl r1, r1, #8 + orr r1, r1, #44 + lsl r1, r1, #8 + orr r1, r1, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x2c + lsl r1, r1, #8 + add r1, r1, #0x13 +#else + mov r1, #0x2c13 +#endif + movt r1, #0xa30a +#endif + mov lr, #0 + ldm r0!, {r10, r11} + umlal r10, lr, r2, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r4, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r6, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r8, r1 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r11, r11, #0xf0000000 +#else + bfc r11, #28, #4 +#endif + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r0, r0, #16 + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa7 + lsl r1, r1, #8 + orr r1, r1, #0xed + lsl r1, r1, #8 + orr r1, r1, #0x9c + lsl r1, r1, #8 + orr r1, r1, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x9c + lsl r1, r1, #8 + add r1, r1, #0xe5 +#else + mov r1, #0x9ce5 +#endif + movt r1, #0xa7ed +#endif + mov lr, #0 + ldm r12, {r10, r11} + umlal r10, lr, r2, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r4, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r6, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r8, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x5d + lsl r1, r1, #8 + orr r1, r1, #8 + lsl r1, r1, #8 + orr r1, r1, #0x63 + lsl r1, r1, #8 + orr r1, r1, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x63 + lsl r1, r1, #8 + add r1, r1, #0x29 +#else + mov r1, #0x6329 +#endif + movt r1, #0x5d08 +#endif + mov lr, #0 + ldm r12, {r10, r11} + umlal r10, lr, r2, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r4, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, 
r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r6, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r8, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xeb + lsl r1, r1, #8 + orr r1, r1, #33 + lsl r1, r1, #8 + orr r1, r1, #6 + lsl r1, r1, #8 + orr r1, r1, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x6 + lsl r1, r1, #8 + add r1, r1, #0x21 +#else + mov r1, #0x621 +#endif + movt r1, #0xeb21 +#endif + mov lr, #0 + ldm r12, {r10, r11} + umlal r10, lr, r2, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r3, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r4, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r6, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adds r10, r10, lr + mov lr, #0 + adc lr, lr, #0 + umlal r10, lr, r8, r1 + adds r11, r11, lr + mov lr, #0 + adc lr, lr, #0 + umlal r11, lr, r9, r1 + stm r12!, {r10, r11, lr} + sub r12, r12, #32 + # Subtract at 4 * 32 + ldm r12, {r10, r11} + subs r10, r10, r2 + sbcs r11, r11, r3 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + sbcs r10, r10, r4 + sbcs r11, r11, r5 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + sbcs r10, r10, r6 + sbcs r11, r11, r7 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + sbcs r10, r10, r8 + sbc r11, r11, r9 + stm r12!, {r10, r11} + sub r12, r12, #36 + asr lr, r11, #25 + # Conditionally subtract order starting at bit 125 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa00000 + lsl r1, r1, #8 + add r1, r1, #0x0 +#else + mov r1, #0xa0000000 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r2, #0x4b + lsl r2, r2, #8 + orr r2, r2, #0x9e + lsl r2, r2, #8 + orr r2, r2, #0xba + lsl r2, r2, #8 + orr r2, r2, #0x7d +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r2, #0xba + lsl r2, r2, #8 + add r2, r2, #0x7d +#else + mov r2, #0xba7d +#endif + movt r2, #0x4b9e +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r3, #0xcb + lsl r3, r3, #8 + orr r3, r3, #2 + lsl r3, r3, #8 + orr r3, r3, #0x4c + lsl r3, r3, #8 + orr r3, r3, #0x63 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r3, #0x4c + lsl r3, r3, #8 + add r3, r3, #0x63 +#else + mov r3, #0x4c63 +#endif + movt r3, #0xcb02 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r4, #0xd4 + lsl r4, r4, #8 + orr r4, r4, #0x5e + lsl r4, r4, #8 + orr r4, r4, #0xf3 + lsl r4, r4, #8 + orr r4, r4, #0x9a +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r4, #0xf3 + lsl r4, r4, #8 + add r4, r4, #0x9a +#else + mov r4, #0xf39a +#endif + movt r4, #0xd45e +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r5, #2 + lsl r5, r5, #8 + orr r5, r5, #0x9b + lsl r5, r5, #8 + orr r5, r5, #0xdf + lsl r5, r5, #8 + orr r5, r5, #59 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r5, #0xdf + lsl r5, r5, #8 + add r5, r5, #0x3b +#else + mov r5, #0xdf3b +#endif + movt r5, #0x29b +#endif +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r9, #0x20000 + lsl r9, r9, #8 + add r9, r9, #0x0 +#else + mov r9, #0x2000000 +#endif + and r1, r1, lr + and r2, r2, lr + and r3, r3, lr + and r4, r4, lr + and r5, r5, lr + and r9, r9, lr + ldm r12, {r10, r11} + adds r10, r10, r1 + adcs r11, r11, r2 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adcs r10, r10, r3 + adcs r11, r11, r4 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adcs r10, r10, r5 + adcs r11, r11, #0 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adcs r10, r10, #0 + adcs r11, r11, #0 + stm r12!, {r10, r11} + ldm r12, {r10} + adcs r10, r10, #0 + stm r12!, {r10} + sub r0, r0, #16 + mov r12, sp + # Load bits 252-376 + add r12, r12, #28 + ldm r12, {r1, r2, r3, r4, r5} + lsl r5, r5, #4 + orr r5, r5, r4, lsr #28 + lsl r4, r4, #4 + orr r4, r4, r3, lsr #28 + lsl r3, r3, #4 + orr r3, r3, r2, lsr #28 + lsl r2, r2, #4 + orr r2, r2, r1, lsr #28 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r5, r5, #0xe0000000 +#else + bfc r5, #29, #3 +#endif + sub r12, r12, #28 + # Sub product of top 4 words and order + mov r0, sp + # * -5cf5d3ed +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa3 + lsl r1, r1, #8 + orr r1, r1, #10 + lsl r1, r1, #8 + orr r1, r1, #44 + lsl r1, r1, #8 + orr r1, r1, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x2c + lsl r1, r1, #8 + add r1, r1, #0x13 +#else + mov r1, #0x2c13 +#endif + movt r1, #0xa30a +#endif + mov lr, #0 + ldm r0, {r6, r7, r8, r9} + umlal r6, lr, r2, r1 + adds r7, r7, lr + mov lr, #0 + adc lr, lr, #0 + umlal r7, lr, r3, r1 + adds r8, r8, lr + mov lr, #0 + adc lr, lr, #0 + umlal r8, lr, r4, r1 + adds r9, r9, lr + mov lr, #0 + adc lr, lr, #0 + umlal r9, lr, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 + # * -5812631b +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa7 + lsl r1, r1, #8 + orr r1, r1, #0xed + lsl r1, r1, #8 + orr r1, r1, #0x9c + lsl r1, r1, #8 + orr r1, r1, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x9c + lsl r1, r1, #8 + add r1, r1, #0xe5 +#else + mov r1, #0x9ce5 +#endif + movt r1, #0xa7ed +#endif + mov r10, #0 + ldm r0, {r6, r7, r8, r9} + umlal r6, r10, r2, r1 + adds r7, r7, r10 + mov r10, #0 + adc r10, r10, #0 + umlal r7, r10, r3, r1 + adds r8, r8, r10 + mov r10, #0 + adc r10, r10, #0 + umlal r8, r10, r4, r1 + adds r9, r9, r10 + mov r10, #0 + adc r10, r10, #0 + umlal r9, r10, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 + # * -a2f79cd7 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x5d + lsl r1, r1, #8 + orr r1, r1, #8 + lsl r1, r1, #8 + orr r1, r1, #0x63 + lsl r1, r1, #8 + orr r1, r1, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x63 + lsl r1, r1, #8 + add r1, r1, #0x29 +#else + mov r1, #0x6329 +#endif + movt r1, #0x5d08 +#endif + mov r11, #0 + ldm r0, {r6, r7, r8, r9} + umlal r6, r11, r2, r1 + adds r7, r7, r11 + mov r11, #0 + adc r11, r11, #0 + umlal r7, r11, r3, r1 + adds r8, r8, r11 + mov r11, #0 + adc r11, r11, #0 + umlal r8, r11, r4, r1 + adds r9, r9, r11 + mov r11, #0 + adc r11, r11, #0 + umlal r9, r11, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 + # * -14def9df +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xeb + lsl r1, r1, #8 + orr r1, r1, #33 + lsl r1, r1, #8 + orr r1, r1, #6 + lsl r1, r1, #8 + orr r1, r1, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x6 + lsl r1, r1, #8 + add r1, r1, #0x21 +#else + mov r1, #0x621 
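+ # (0x621 combined with the movt #0xeb21 below forms 0xeb210621, which is
+ # 2^32 - 0x14def9df: the multiplier for the "* -14def9df" step of reducing
+ # modulo the Ed25519 group order, whose low 128 bits are
+ # 0x14def9dea2f79cd65812631a5cf5d3ed. The WOLFSSL_ARM_ARCH < 7 branches
+ # build the same constants a byte at a time with mov/lsl/orr because movt
+ # is not available on those cores.)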
+#endif + movt r1, #0xeb21 +#endif + mov r12, #0 + ldm r0, {r6, r7, r8, r9} + umlal r6, r12, r2, r1 + adds r7, r7, r12 + mov r12, #0 + adc r12, r12, #0 + umlal r7, r12, r3, r1 + adds r8, r8, r12 + mov r12, #0 + adc r12, r12, #0 + umlal r8, r12, r4, r1 + adds r9, r9, r12 + mov r12, #0 + adc r12, r12, #0 + umlal r9, r12, r5, r1 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 + # Add overflows at 4 * 32 + ldm r0, {r6, r7, r8, r9} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else + bfc r9, #28, #4 +#endif + adds r6, r6, lr + adcs r7, r7, r10 + adcs r8, r8, r11 + adc r9, r9, r12 + # Subtract top at 4 * 32 + subs r6, r6, r2 + sbcs r7, r7, r3 + sbcs r8, r8, r4 + sbcs r9, r9, r5 + sbc r1, r1, r1 + sub r0, r0, #16 + ldm r0, {r2, r3, r4, r5} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x5c + lsl r10, r10, #8 + orr r10, r10, #0xf5 + lsl r10, r10, #8 + orr r10, r10, #0xd3 + lsl r10, r10, #8 + orr r10, r10, #0xed +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0xd3 + lsl r10, r10, #8 + add r10, r10, #0xed +#else + mov r10, #0xd3ed +#endif + movt r10, #0x5cf5 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0x58 + lsl r11, r11, #8 + orr r11, r11, #18 + lsl r11, r11, #8 + orr r11, r11, #0x63 + lsl r11, r11, #8 + orr r11, r11, #26 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0x63 + lsl r11, r11, #8 + add r11, r11, #0x1a +#else + mov r11, #0x631a +#endif + movt r11, #0x5812 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r12, #0xa2 + lsl r12, r12, #8 + orr r12, r12, #0xf7 + lsl r12, r12, #8 + orr r12, r12, #0x9c + lsl r12, r12, #8 + orr r12, r12, #0xd6 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r12, #0x9c + lsl r12, r12, #8 + add r12, r12, #0xd6 +#else + mov r12, #0x9cd6 +#endif + movt r12, #0xa2f7 +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov lr, #20 + lsl lr, lr, #8 + orr lr, lr, #0xde + lsl lr, lr, #8 + orr lr, lr, #0xf9 + lsl lr, lr, #8 + orr lr, lr, #0xde +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov lr, #0xf9 + lsl lr, lr, #8 + add lr, lr, #0xde +#else + mov lr, #0xf9de +#endif + movt lr, #0x14de +#endif + and r10, r10, r1 + and r11, r11, r1 + and r12, r12, r1 + and lr, lr, r1 + adds r2, r2, r10 + adcs r3, r3, r11 + adcs r4, r4, r12 + adcs r5, r5, lr + adcs r6, r6, #0 + adcs r7, r7, #0 + and r1, r1, #0x10000000 + adcs r8, r8, #0 + adc r9, r9, r1 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else + bfc r9, #28, #4 +#endif + ldr r0, [sp, #68] + # Store result + str r2, [r0] + str r3, [r0, #4] + str r4, [r0, #8] + str r5, [r0, #12] + str r6, [r0, #16] + str r7, [r0, #20] + str r8, [r0, #24] + str r9, [r0, #28] + add sp, sp, #0x50 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size sc_muladd,.-sc_muladd +#else + .text + .align 4 + .globl sc_muladd + .type sc_muladd, %function +sc_muladd: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #0x50 add lr, sp, #0x44 stm lr, {r0, r1, r3} mov lr, r2 @@ -4777,10 +8096,10 @@ sc_muladd: mov r3, r12 add lr, sp, #32 stm lr, {r3, r4, r5, r6, r7, r8, r9, r10} - add r0, sp, #0x50 + mov r0, sp # Add c to a * b ldr lr, [sp, #76] - ldm sp!, {r2, r3, r4, r5, r6, r7, r8, r9} + ldm r0, {r2, r3, r4, r5, r6, r7, r8, r9} ldm lr!, {r1, r10, r11, r12} adds r2, r2, r1 adcs r3, r3, r10 @@ -4792,8 +8111,8 @@ sc_muladd: adcs r8, r8, r11 adcs r9, r9, r12 mov r1, r9 - stm r0, {r2, r3, r4, r5, 
r6, r7, r8, r9} - ldm sp, {r2, r3, r4, r5, r6, r7, r8, r9} + stm r0!, {r2, r3, r4, r5, r6, r7, r8, r9} + ldm r0, {r2, r3, r4, r5, r6, r7, r8, r9} adcs r2, r2, #0 adcs r3, r3, #0 adcs r4, r4, #0 @@ -4802,28 +8121,41 @@ sc_muladd: adcs r7, r7, #0 adcs r8, r8, #0 adc r9, r9, #0 - sub sp, sp, #32 + sub r0, r0, #32 # Get 252..503 and 504..507 lsr lr, r9, #24 - bfc r9, #24, #8 lsl r9, r9, #4 - orr r9, r9, r8, lsr #28 + orr r9, r9, r8, LSR #28 lsl r8, r8, #4 - orr r8, r8, r7, lsr #28 + orr r8, r8, r7, LSR #28 lsl r7, r7, #4 - orr r7, r7, r6, lsr #28 + orr r7, r7, r6, LSR #28 lsl r6, r6, #4 - orr r6, r6, r5, lsr #28 + orr r6, r6, r5, LSR #28 lsl r5, r5, #4 - orr r5, r5, r4, lsr #28 + orr r5, r5, r4, LSR #28 lsl r4, r4, #4 - orr r4, r4, r3, lsr #28 + orr r4, r4, r3, LSR #28 lsl r3, r3, #4 - orr r3, r3, r2, lsr #28 + orr r3, r3, r2, LSR #28 lsl r2, r2, #4 - orr r2, r2, r1, lsr #28 + orr r2, r2, r1, LSR #28 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else + bfc r9, #28, #4 +#endif # Add order times bits 504..507 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0xa3 + lsl r10, r10, #8 + orr r10, r10, #10 + lsl r10, r10, #8 + orr r10, r10, #44 + lsl r10, r10, #8 + orr r10, r10, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r10, #0x2c lsl r10, r10, #8 add r10, r10, #0x13 @@ -4831,7 +8163,17 @@ sc_muladd: mov r10, #0x2c13 #endif movt r10, #0xa30a -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0xa7 + lsl r11, r11, #8 + orr r11, r11, #0xed + lsl r11, r11, #8 + orr r11, r11, #0x9c + lsl r11, r11, #8 + orr r11, r11, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r11, #0x9c lsl r11, r11, #8 add r11, r11, #0xe5 @@ -4839,10 +8181,20 @@ sc_muladd: mov r11, #0x9ce5 #endif movt r11, #0xa7ed +#endif mov r1, #0 umlal r2, r1, r10, lr umaal r3, r1, r11, lr -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x5d + lsl r10, r10, #8 + orr r10, r10, #8 + lsl r10, r10, #8 + orr r10, r10, #0x63 + lsl r10, r10, #8 + orr r10, r10, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r10, #0x63 lsl r10, r10, #8 add r10, r10, #0x29 @@ -4850,7 +8202,17 @@ sc_muladd: mov r10, #0x6329 #endif movt r10, #0x5d08 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0xeb + lsl r11, r11, #8 + orr r11, r11, #33 + lsl r11, r11, #8 + orr r11, r11, #6 + lsl r11, r11, #8 + orr r11, r11, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r11, #0x6 lsl r11, r11, #8 add r11, r11, #0x21 @@ -4858,6 +8220,7 @@ sc_muladd: mov r11, #0x621 #endif movt r11, #0xeb21 +#endif umaal r4, r1, r10, lr umaal r5, r1, r11, lr adds r6, r6, r1 @@ -4869,7 +8232,17 @@ sc_muladd: sbcs r8, r8, #0 sbc r9, r9, #0 # Sub product of top 8 words and order -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + mov r12, sp +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa3 + lsl r1, r1, #8 + orr r1, r1, #10 + lsl r1, r1, #8 + orr r1, r1, #44 + lsl r1, r1, #8 + orr r1, r1, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x2c lsl r1, r1, #8 add r1, r1, #0x13 @@ -4877,25 +8250,41 @@ sc_muladd: mov r1, #0x2c13 #endif movt r1, #0xa30a +#endif mov lr, 
#0 - ldm r0!, {r10, r11, r12} + ldm r0!, {r10, r11} umlal r10, lr, r2, r1 umaal r11, lr, r3, r1 - umaal r12, lr, r4, r1 - stm sp!, {r10, r11, r12} - ldm r0!, {r10, r11, r12} - umaal r10, lr, r5, r1 - umaal r11, lr, r6, r1 - umaal r12, lr, r7, r1 - stm sp!, {r10, r11, r12} + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + umaal r10, lr, r4, r1 + umaal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r0!, {r10, r11} + umaal r10, lr, r6, r1 + umaal r11, lr, r7, r1 + stm r12!, {r10, r11} ldm r0!, {r10, r11} umaal r10, lr, r8, r1 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r11, r11, #0xf0000000 +#else bfc r11, #28, #4 +#endif umaal r11, lr, r9, r1 - stm sp!, {r10, r11, lr} + stm r12!, {r10, r11, lr} sub r0, r0, #16 - sub sp, sp, #32 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa7 + lsl r1, r1, #8 + orr r1, r1, #0xed + lsl r1, r1, #8 + orr r1, r1, #0x9c + lsl r1, r1, #8 + orr r1, r1, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x9c lsl r1, r1, #8 add r1, r1, #0xe5 @@ -4903,23 +8292,35 @@ sc_muladd: mov r1, #0x9ce5 #endif movt r1, #0xa7ed +#endif mov lr, #0 - ldm sp, {r10, r11, r12} + ldm r12, {r10, r11} umlal r10, lr, r2, r1 umaal r11, lr, r3, r1 - umaal r12, lr, r4, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11, r12} - umaal r10, lr, r5, r1 - umaal r11, lr, r6, r1 - umaal r12, lr, r7, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11} + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r4, r1 + umaal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r6, r1 + umaal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} umaal r10, lr, r8, r1 umaal r11, lr, r9, r1 - stm sp!, {r10, r11, lr} - sub sp, sp, #32 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + stm r12!, {r10, r11, lr} + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x5d + lsl r1, r1, #8 + orr r1, r1, #8 + lsl r1, r1, #8 + orr r1, r1, #0x63 + lsl r1, r1, #8 + orr r1, r1, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x63 lsl r1, r1, #8 add r1, r1, #0x29 @@ -4927,23 +8328,35 @@ sc_muladd: mov r1, #0x6329 #endif movt r1, #0x5d08 +#endif mov lr, #0 - ldm sp, {r10, r11, r12} + ldm r12, {r10, r11} umlal r10, lr, r2, r1 umaal r11, lr, r3, r1 - umaal r12, lr, r4, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11, r12} - umaal r10, lr, r5, r1 - umaal r11, lr, r6, r1 - umaal r12, lr, r7, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11} + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r4, r1 + umaal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r6, r1 + umaal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} umaal r10, lr, r8, r1 umaal r11, lr, r9, r1 - stm sp!, {r10, r11, lr} - sub sp, sp, #32 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + stm r12!, {r10, r11, lr} + sub r12, r12, #32 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xeb + lsl r1, r1, #8 + orr r1, r1, #33 + lsl r1, r1, #8 + orr r1, r1, #6 + lsl r1, r1, #8 + orr r1, r1, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x6 lsl r1, r1, #8 add r1, r1, #0x21 @@ -4951,48 +8364,62 @@ sc_muladd: mov r1, #0x621 #endif movt r1, #0xeb21 +#endif mov lr, #0 - ldm sp, {r10, r11, r12} + ldm r12, {r10, r11} umlal r10, lr, r2, r1 umaal r11, lr, r3, r1 - umaal r12, lr, r4, r1 - stm sp!, {r10, 
r11, r12} - ldm sp, {r10, r11, r12} - umaal r10, lr, r5, r1 - umaal r11, lr, r6, r1 - umaal r12, lr, r7, r1 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11} + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r4, r1 + umaal r11, lr, r5, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + umaal r10, lr, r6, r1 + umaal r11, lr, r7, r1 + stm r12!, {r10, r11} + ldm r12, {r10, r11} umaal r10, lr, r8, r1 umaal r11, lr, r9, r1 - stm sp!, {r10, r11, lr} - sub sp, sp, #32 + stm r12!, {r10, r11, lr} + sub r12, r12, #32 # Subtract at 4 * 32 - ldm sp, {r10, r11, r12} + ldm r12, {r10, r11} subs r10, r10, r2 sbcs r11, r11, r3 - sbcs r12, r12, r4 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11, r12} - sbcs r10, r10, r5 - sbcs r11, r11, r6 - sbcs r12, r12, r7 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11} + stm r12!, {r10, r11} + ldm r12, {r10, r11} + sbcs r10, r10, r4 + sbcs r11, r11, r5 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + sbcs r10, r10, r6 + sbcs r11, r11, r7 + stm r12!, {r10, r11} + ldm r12, {r10, r11} sbcs r10, r10, r8 sbc r11, r11, r9 - stm sp!, {r10, r11} - sub sp, sp, #36 + stm r12!, {r10, r11} + sub r12, r12, #36 asr lr, r11, #25 # Conditionally subtract order starting at bit 125 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0xa00000 lsl r1, r1, #8 add r1, r1, #0x0 #else mov r1, #0xa0000000 #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r2, #0x4b + lsl r2, r2, #8 + orr r2, r2, #0x9e + lsl r2, r2, #8 + orr r2, r2, #0xba + lsl r2, r2, #8 + orr r2, r2, #0x7d +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r2, #0xba lsl r2, r2, #8 add r2, r2, #0x7d @@ -5000,7 +8427,17 @@ sc_muladd: mov r2, #0xba7d #endif movt r2, #0x4b9e -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r3, #0xcb + lsl r3, r3, #8 + orr r3, r3, #2 + lsl r3, r3, #8 + orr r3, r3, #0x4c + lsl r3, r3, #8 + orr r3, r3, #0x63 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r3, #0x4c lsl r3, r3, #8 add r3, r3, #0x63 @@ -5008,7 +8445,17 @@ sc_muladd: mov r3, #0x4c63 #endif movt r3, #0xcb02 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r4, #0xd4 + lsl r4, r4, #8 + orr r4, r4, #0x5e + lsl r4, r4, #8 + orr r4, r4, #0xf3 + lsl r4, r4, #8 + orr r4, r4, #0x9a +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r4, #0xf3 lsl r4, r4, #8 add r4, r4, #0x9a @@ -5016,7 +8463,17 @@ sc_muladd: mov r4, #0xf39a #endif movt r4, #0xd45e -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r5, #2 + lsl r5, r5, #8 + orr r5, r5, #0x9b + lsl r5, r5, #8 + orr r5, r5, #0xdf + lsl r5, r5, #8 + orr r5, r5, #59 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r5, #0xdf lsl r5, r5, #8 add r5, r5, #0x3b @@ -5024,7 +8481,8 @@ sc_muladd: mov r5, #0xdf3b #endif movt r5, #0x29b -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r9, #0x20000 lsl r9, r9, #8 add r9, r9, #0x0 @@ -5037,26 +8495,30 @@ sc_muladd: and r4, r4, lr and r5, r5, lr and r9, r9, lr - ldm sp, {r10, r11, r12} + ldm r12, {r10, r11} adds r10, r10, r1 adcs r11, r11, r2 - adcs r12, r12, r3 - stm sp!, {r10, r11, r12} - ldm sp, 
{r10, r11, r12} - adcs r10, r10, r4 - adcs r11, r11, r5 - adcs r12, r12, #0 - stm sp!, {r10, r11, r12} - ldm sp, {r10, r11, r12} + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adcs r10, r10, r3 + adcs r11, r11, r4 + stm r12!, {r10, r11} + ldm r12, {r10, r11} + adcs r10, r10, r5 + adcs r11, r11, #0 + stm r12!, {r10, r11} + ldm r12, {r10, r11} adcs r10, r10, #0 adcs r11, r11, #0 - adcs r12, r12, r9 - stm sp!, {r10, r11, r12} - sub sp, sp, #48 + stm r12!, {r10, r11} + ldm r12, {r10} + adcs r10, r10, #0 + stm r12!, {r10} sub r0, r0, #16 + mov r12, sp # Load bits 252-376 - add sp, sp, #28 - ldm sp, {r1, r2, r3, r4, r5} + add r12, r12, #28 + ldm r12, {r1, r2, r3, r4, r5} lsl r5, r5, #4 orr r5, r5, r4, lsr #28 lsl r4, r4, #4 @@ -5065,11 +8527,25 @@ sc_muladd: orr r3, r3, r2, lsr #28 lsl r2, r2, #4 orr r2, r2, r1, lsr #28 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r5, r5, #0xe0000000 +#else bfc r5, #29, #3 - sub sp, sp, #28 - # Sub product of top 8 words and order +#endif + sub r12, r12, #28 + # Sub product of top 4 words and order + mov r0, sp # * -5cf5d3ed -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa3 + lsl r1, r1, #8 + orr r1, r1, #10 + lsl r1, r1, #8 + orr r1, r1, #44 + lsl r1, r1, #8 + orr r1, r1, #19 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x2c lsl r1, r1, #8 add r1, r1, #0x13 @@ -5077,16 +8553,26 @@ sc_muladd: mov r1, #0x2c13 #endif movt r1, #0xa30a +#endif mov lr, #0 - ldm sp, {r6, r7, r8, r9} + ldm r0, {r6, r7, r8, r9} umlal r6, lr, r2, r1 umaal r7, lr, r3, r1 umaal r8, lr, r4, r1 umaal r9, lr, r5, r1 - stm sp, {r6, r7, r8, r9} - add sp, sp, #4 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 # * -5812631b -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xa7 + lsl r1, r1, #8 + orr r1, r1, #0xed + lsl r1, r1, #8 + orr r1, r1, #0x9c + lsl r1, r1, #8 + orr r1, r1, #0xe5 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x9c lsl r1, r1, #8 add r1, r1, #0xe5 @@ -5094,16 +8580,26 @@ sc_muladd: mov r1, #0x9ce5 #endif movt r1, #0xa7ed +#endif mov r10, #0 - ldm sp, {r6, r7, r8, r9} + ldm r0, {r6, r7, r8, r9} umlal r6, r10, r2, r1 umaal r7, r10, r3, r1 umaal r8, r10, r4, r1 umaal r9, r10, r5, r1 - stm sp, {r6, r7, r8, r9} - add sp, sp, #4 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 # * -a2f79cd7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0x5d + lsl r1, r1, #8 + orr r1, r1, #8 + lsl r1, r1, #8 + orr r1, r1, #0x63 + lsl r1, r1, #8 + orr r1, r1, #41 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x63 lsl r1, r1, #8 add r1, r1, #0x29 @@ -5111,16 +8607,26 @@ sc_muladd: mov r1, #0x6329 #endif movt r1, #0x5d08 +#endif mov r11, #0 - ldm sp, {r6, r7, r8, r9} + ldm r0, {r6, r7, r8, r9} umlal r6, r11, r2, r1 umaal r7, r11, r3, r1 umaal r8, r11, r4, r1 umaal r9, r11, r5, r1 - stm sp, {r6, r7, r8, r9} - add sp, sp, #4 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 # * -14def9df -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r1, #0xeb + lsl r1, r1, #8 + orr r1, r1, #33 + lsl r1, r1, #8 + orr r1, r1, #6 + lsl r1, r1, #8 + orr r1, r1, #33 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r1, #0x6 lsl r1, r1, #8 add r1, r1, #0x21 @@ -5128,17 +8634,22 @@ sc_muladd: mov r1, 
#0x621 #endif movt r1, #0xeb21 +#endif mov r12, #0 - ldm sp, {r6, r7, r8, r9} + ldm r0, {r6, r7, r8, r9} umlal r6, r12, r2, r1 umaal r7, r12, r3, r1 umaal r8, r12, r4, r1 umaal r9, r12, r5, r1 - stm sp, {r6, r7, r8, r9} - add sp, sp, #4 + stm r0, {r6, r7, r8, r9} + add r0, r0, #4 # Add overflows at 4 * 32 - ldm sp, {r6, r7, r8, r9} + ldm r0, {r6, r7, r8, r9} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else bfc r9, #28, #4 +#endif adds r6, r6, lr adcs r7, r7, r10 adcs r8, r8, r11 @@ -5149,9 +8660,18 @@ sc_muladd: sbcs r8, r8, r4 sbcs r9, r9, r5 sbc r1, r1, r1 - sub sp, sp, #16 - ldm sp, {r2, r3, r4, r5} -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + sub r0, r0, #16 + ldm r0, {r2, r3, r4, r5} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r10, #0x5c + lsl r10, r10, #8 + orr r10, r10, #0xf5 + lsl r10, r10, #8 + orr r10, r10, #0xd3 + lsl r10, r10, #8 + orr r10, r10, #0xed +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r10, #0xd3 lsl r10, r10, #8 add r10, r10, #0xed @@ -5159,7 +8679,17 @@ sc_muladd: mov r10, #0xd3ed #endif movt r10, #0x5cf5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r11, #0x58 + lsl r11, r11, #8 + orr r11, r11, #18 + lsl r11, r11, #8 + orr r11, r11, #0x63 + lsl r11, r11, #8 + orr r11, r11, #26 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r11, #0x63 lsl r11, r11, #8 add r11, r11, #0x1a @@ -5167,7 +8697,17 @@ sc_muladd: mov r11, #0x631a #endif movt r11, #0x5812 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov r12, #0xa2 + lsl r12, r12, #8 + orr r12, r12, #0xf7 + lsl r12, r12, #8 + orr r12, r12, #0x9c + lsl r12, r12, #8 + orr r12, r12, #0xd6 +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov r12, #0x9c lsl r12, r12, #8 add r12, r12, #0xd6 @@ -5175,7 +8715,17 @@ sc_muladd: mov r12, #0x9cd6 #endif movt r12, #0xa2f7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + mov lr, #20 + lsl lr, lr, #8 + orr lr, lr, #0xde + lsl lr, lr, #8 + orr lr, lr, #0xf9 + lsl lr, lr, #8 + orr lr, lr, #0xde +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) mov lr, #0xf9 lsl lr, lr, #8 add lr, lr, #0xde @@ -5183,6 +8733,7 @@ sc_muladd: mov lr, #0xf9de #endif movt lr, #0x14de +#endif and r10, r10, r1 and r11, r11, r1 and r12, r12, r1 @@ -5196,7 +8747,11 @@ sc_muladd: and r1, r1, #0x10000000 adcs r8, r8, #0 adc r9, r9, r1 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + bic r9, r9, #0xf0000000 +#else bfc r9, #28, #4 +#endif ldr r0, [sp, #68] # Store result str r2, [r0] @@ -5207,9 +8762,10 @@ sc_muladd: str r7, [r0, #20] str r8, [r0, #24] str r9, [r0, #28] - add sp, sp, #0x70 + add sp, sp, #0x50 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} .size sc_muladd,.-sc_muladd +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ #endif /* HAVE_ED25519_SIGN */ #endif /* HAVE_ED25519 */ diff --git a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c index 1de8c9c77..eacdd1c92 100644 --- a/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-curve25519_c.c @@ -39,6 +39,18 @@ #include #include #ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ 
asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ /* Based on work by: Emil Lenngren * https://github.com/pornin/X25519-Cortex-M4 */ @@ -65,13 +77,13 @@ void fe_add_sub_op() { __asm__ __volatile__ ( /* Add-Sub */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [r2]\n\t" "ldr r5, [r2, #4]\n\t" #else "ldrd r4, r5, [r2]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3]\n\t" "ldr r7, [r3, #4]\n\t" #else @@ -82,7 +94,7 @@ void fe_add_sub_op() "mov r12, #0\n\t" "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [r0]\n\t" "str r9, [r0, #4]\n\t" #else @@ -91,19 +103,19 @@ void fe_add_sub_op() /* Sub */ "subs r10, r4, r6\n\t" "sbcs r11, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [r1]\n\t" "str r11, [r1, #4]\n\t" #else "strd r10, r11, [r1]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [r2, #8]\n\t" "ldr r5, [r2, #12]\n\t" #else "ldrd r4, r5, [r2, #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #8]\n\t" "ldr r7, [r3, #12]\n\t" #else @@ -114,7 +126,7 @@ void fe_add_sub_op() "mov lr, #0\n\t" "sbcs r11, r5, r7\n\t" "adc lr, lr, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [r1, #8]\n\t" "str r11, [r1, #12]\n\t" #else @@ -124,19 +136,19 @@ void fe_add_sub_op() "subs r12, r12, #1\n\t" "adcs r8, r4, r6\n\t" "adcs r9, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [r0, #8]\n\t" "str r9, [r0, #12]\n\t" #else "strd r8, r9, [r0, #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [r2, #16]\n\t" "ldr r5, [r2, #20]\n\t" #else "ldrd r4, r5, [r2, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #16]\n\t" "ldr r7, [r3, #20]\n\t" #else @@ -147,7 +159,7 @@ void fe_add_sub_op() "mov r12, #0\n\t" "adcs r9, r5, r7\n\t" "adc r12, r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [r0, #16]\n\t" "str r9, [r0, #20]\n\t" #else @@ -157,19 +169,19 @@ void fe_add_sub_op() "subs lr, lr, #1\n\t" "sbcs r10, r4, r6\n\t" "sbcs r11, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [r1, #16]\n\t" "str r11, [r1, #20]\n\t" #else "strd r10, r11, [r1, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [r2, #24]\n\t" "ldr r5, [r2, #28]\n\t" #else "ldrd r4, r5, [r2, #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 
7) "ldr r6, [r3, #24]\n\t" "ldr r7, [r3, #28]\n\t" #else @@ -191,13 +203,13 @@ void fe_add_sub_op() "orr r3, r3, r9, lsr #31\n\t" "mul r12, r3, r12\n\t" /* Add -x*modulus (if overflow) */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [r0]\n\t" "ldr r5, [r0, #4]\n\t" #else "ldrd r4, r5, [r0]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r0, #8]\n\t" "ldr r7, [r0, #12]\n\t" #else @@ -207,19 +219,19 @@ void fe_add_sub_op() "adcs r5, r5, #0\n\t" "adcs r6, r6, #0\n\t" "adcs r7, r7, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [r0]\n\t" "str r5, [r0, #4]\n\t" #else "strd r4, r5, [r0]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [r0, #8]\n\t" "str r7, [r0, #12]\n\t" #else "strd r6, r7, [r0, #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [r0, #16]\n\t" "ldr r5, [r0, #20]\n\t" #else @@ -227,16 +239,20 @@ void fe_add_sub_op() #endif "adcs r4, r4, #0\n\t" "adcs r5, r5, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [r0, #16]\n\t" "str r5, [r0, #20]\n\t" #else "strd r4, r5, [r0, #16]\n\t" #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0x80000000\n\t" +#else "bfc r9, #31, #1\n\t" +#endif "adcs r8, r8, #0\n\t" "adc r9, r9, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [r0, #24]\n\t" "str r9, [r0, #28]\n\t" #else @@ -255,7 +271,11 @@ void fe_add_sub_op() "sbcs r7, r7, #0\n\t" "sbcs r8, r8, #0\n\t" "sbcs r9, r9, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r11, r11, #0x80000000\n\t" +#else "bfc r11, #31, #1\n\t" +#endif "sbcs r10, r10, #0\n\t" "sbc r11, r11, #0\n\t" "stm r1, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" @@ -293,7 +313,11 @@ void fe_sub_op() "sbcs r9, r9, #0\n\t" "sbcs r10, r10, #0\n\t" "sbcs r11, r11, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic lr, lr, #0x80000000\n\t" +#else "bfc lr, #31, #1\n\t" +#endif "sbcs r12, r12, #0\n\t" "sbc lr, lr, #0\n\t" "stm r0, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t" @@ -346,7 +370,11 @@ void fe_add_op() "adcs r9, r9, #0\n\t" "adcs r10, r10, #0\n\t" "adcs r11, r11, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic lr, lr, #0x80000000\n\t" +#else "bfc lr, #31, #1\n\t" +#endif "adcs r12, r12, #0\n\t" "adc lr, lr, #0\n\t" "stm r0, {r6, r7, r8, r9, r10, r11, r12, lr}\n\t" @@ -386,7 +414,11 @@ void fe_frombytes(fe out_p, const unsigned char* in_p) "ldr r7, [%[in], #20]\n\t" "ldr r8, [%[in], #24]\n\t" "ldr r9, [%[in], #28]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0x80000000\n\t" +#else "bfc r9, #31, #1\n\t" +#endif "str r2, [%[out]]\n\t" "str r3, [%[out], #4]\n\t" "str r4, [%[out], #8]\n\t" @@ -426,7 +458,11 @@ void fe_tobytes(unsigned char* out_p, const fe n_p) "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" "adc r9, r9, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0x80000000\n\t" +#else "bfc r9, #31, #1\n\t" +#endif "str r2, [%[out]]\n\t" "str r3, [%[out], 
#4]\n\t" "str r4, [%[out], #8]\n\t" @@ -490,49 +526,49 @@ void fe_copy(fe r_p, const fe a_p) __asm__ __volatile__ ( /* Copy */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r2, [%[a]]\n\t" "ldr r3, [%[a], #4]\n\t" #else "ldrd r2, r3, [%[a]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[a], #8]\n\t" "ldr r5, [%[a], #12]\n\t" #else "ldrd r4, r5, [%[a], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r2, [%[r]]\n\t" "str r3, [%[r], #4]\n\t" #else "strd r2, r3, [%[r]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[r], #8]\n\t" "str r5, [%[r], #12]\n\t" #else "strd r4, r5, [%[r], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r2, [%[a], #16]\n\t" "ldr r3, [%[a], #20]\n\t" #else "ldrd r2, r3, [%[a], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[a], #24]\n\t" "ldr r5, [%[a], #28]\n\t" #else "ldrd r4, r5, [%[a], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r2, [%[r], #16]\n\t" "str r3, [%[r], #20]\n\t" #else "strd r2, r3, [%[r], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[r], #24]\n\t" "str r5, [%[r], #28]\n\t" #else @@ -595,7 +631,11 @@ int fe_isnonzero(const fe a_p) "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" "adc r9, r9, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0x80000000\n\t" +#else "bfc r9, #31, #1\n\t" +#endif "orr r2, r2, r3\n\t" "orr r4, r4, r5\n\t" "orr r6, r6, r7\n\t" @@ -645,8 +685,18 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) register signed char b asm ("r2") = (signed char)b_p; __asm__ __volatile__ ( +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl %[b], %[b], #24\n\t" + "asr %[b], %[b], #24\n\t" +#else "sxtb %[b], %[b]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r3, %[b], #24\n\t" + "asr r3, %[b], #31\n\t" +#else "sbfx r3, %[b], #7, #1\n\t" +#endif "eor r12, %[b], r3\n\t" "sub r12, r12, r3\n\t" "mov r4, #1\n\t" @@ -655,7 +705,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "mov r7, #0\n\t" "mov r8, #0\n\t" "mov r9, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -665,7 +715,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #31\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base]]\n\t" "ldr r11, [%[base], #4]\n\t" #else @@ -677,7 +727,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #32]\n\t" 
"ldr r11, [%[base], #36]\n\t" #else @@ -689,7 +739,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #64]\n\t" "ldr r11, [%[base], #68]\n\t" #else @@ -702,7 +752,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -712,7 +762,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #30\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base]]\n\t" "ldr r11, [%[base], #4]\n\t" #else @@ -724,7 +774,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #32]\n\t" "ldr r11, [%[base], #36]\n\t" #else @@ -736,7 +786,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #64]\n\t" "ldr r11, [%[base], #68]\n\t" #else @@ -749,7 +799,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -759,7 +809,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #29\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base]]\n\t" "ldr r11, [%[base], #4]\n\t" #else @@ -771,7 +821,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #32]\n\t" "ldr r11, [%[base], #36]\n\t" #else @@ -783,7 +833,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #64]\n\t" "ldr r11, [%[base], #68]\n\t" #else @@ -796,7 +846,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -806,7 +856,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #28\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base]]\n\t" "ldr r11, [%[base], #4]\n\t" #else @@ -818,7 +868,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #32]\n\t" "ldr r11, [%[base], #36]\n\t" #else @@ -830,7 +880,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #64]\n\t" "ldr r11, [%[base], #68]\n\t" #else @@ -843,7 +893,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -853,7 +903,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #27\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base]]\n\t" "ldr r11, [%[base], #4]\n\t" #else @@ -865,7 +915,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #32]\n\t" "ldr r11, [%[base], #36]\n\t" #else @@ -877,7 +927,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #64]\n\t" "ldr r11, [%[base], #68]\n\t" #else @@ -890,7 +940,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -900,7 +950,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #26\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base]]\n\t" "ldr r11, [%[base], #4]\n\t" #else @@ -912,7 +962,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #32]\n\t" "ldr r11, [%[base], #36]\n\t" #else @@ -924,7 +974,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #64]\n\t" "ldr r11, [%[base], #68]\n\t" #else @@ -937,7 +987,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" 
"eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -947,7 +997,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #25\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base]]\n\t" "ldr r11, [%[base], #4]\n\t" #else @@ -959,7 +1009,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #32]\n\t" "ldr r11, [%[base], #36]\n\t" #else @@ -971,7 +1021,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #64]\n\t" "ldr r11, [%[base], #68]\n\t" #else @@ -984,7 +1034,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -994,7 +1044,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #24\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base]]\n\t" "ldr r11, [%[base], #4]\n\t" #else @@ -1006,7 +1056,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #32]\n\t" "ldr r11, [%[base], #36]\n\t" #else @@ -1018,7 +1068,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #64]\n\t" "ldr r11, [%[base], #68]\n\t" #else @@ -1051,25 +1101,30 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r11, r11, r9\n\t" "and r11, r11, r12\n\t" "eor r9, r9, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[r]]\n\t" "str r5, [%[r], #4]\n\t" #else "strd r4, r5, [%[r]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[r], #32]\n\t" "str r7, [%[r], #36]\n\t" #else "strd r6, r7, [%[r], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[r], #64]\n\t" "str r9, [%[r], #68]\n\t" #else "strd r8, r9, [%[r], #64]\n\t" #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r3, %[b], #24\n\t" + "asr r3, %[b], #31\n\t" +#else "sbfx r3, %[b], #7, #1\n\t" +#endif "eor r12, %[b], r3\n\t" "sub r12, r12, r3\n\t" "mov 
r4, #0\n\t" @@ -1078,7 +1133,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "mov r7, #0\n\t" "mov r8, #0\n\t" "mov r9, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1088,7 +1143,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #31\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #8]\n\t" "ldr r11, [%[base], #12]\n\t" #else @@ -1100,7 +1155,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #40]\n\t" "ldr r11, [%[base], #44]\n\t" #else @@ -1112,7 +1167,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #72]\n\t" "ldr r11, [%[base], #76]\n\t" #else @@ -1125,7 +1180,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1135,7 +1190,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #30\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #8]\n\t" "ldr r11, [%[base], #12]\n\t" #else @@ -1147,7 +1202,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #40]\n\t" "ldr r11, [%[base], #44]\n\t" #else @@ -1159,7 +1214,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #72]\n\t" "ldr r11, [%[base], #76]\n\t" #else @@ -1172,7 +1227,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1182,7 +1237,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #29\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #8]\n\t" "ldr r11, [%[base], #12]\n\t" #else @@ -1194,7 +1249,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #40]\n\t" "ldr r11, [%[base], #44]\n\t" #else @@ -1206,7 +1261,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #72]\n\t" "ldr r11, [%[base], #76]\n\t" #else @@ -1219,7 +1274,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1229,7 +1284,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #28\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #8]\n\t" "ldr r11, [%[base], #12]\n\t" #else @@ -1241,7 +1296,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #40]\n\t" "ldr r11, [%[base], #44]\n\t" #else @@ -1253,7 +1308,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #72]\n\t" "ldr r11, [%[base], #76]\n\t" #else @@ -1266,7 +1321,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1276,7 +1331,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #27\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #8]\n\t" "ldr r11, [%[base], #12]\n\t" #else @@ -1288,7 +1343,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #40]\n\t" "ldr r11, [%[base], #44]\n\t" #else @@ -1300,7 +1355,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #72]\n\t" "ldr r11, [%[base], #76]\n\t" #else @@ -1313,7 +1368,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1323,7 +1378,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) 
"ror r3, r3, #26\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #8]\n\t" "ldr r11, [%[base], #12]\n\t" #else @@ -1335,7 +1390,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #40]\n\t" "ldr r11, [%[base], #44]\n\t" #else @@ -1347,7 +1402,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #72]\n\t" "ldr r11, [%[base], #76]\n\t" #else @@ -1360,7 +1415,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1370,7 +1425,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #25\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #8]\n\t" "ldr r11, [%[base], #12]\n\t" #else @@ -1382,7 +1437,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #40]\n\t" "ldr r11, [%[base], #44]\n\t" #else @@ -1394,7 +1449,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #72]\n\t" "ldr r11, [%[base], #76]\n\t" #else @@ -1407,7 +1462,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1417,7 +1472,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #24\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #8]\n\t" "ldr r11, [%[base], #12]\n\t" #else @@ -1429,7 +1484,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #40]\n\t" "ldr r11, [%[base], #44]\n\t" #else @@ -1441,7 +1496,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], 
#72]\n\t" "ldr r11, [%[base], #76]\n\t" #else @@ -1475,25 +1530,30 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r11, r11, r9\n\t" "and r11, r11, r12\n\t" "eor r9, r9, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[r], #8]\n\t" "str r5, [%[r], #12]\n\t" #else "strd r4, r5, [%[r], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[r], #40]\n\t" "str r7, [%[r], #44]\n\t" #else "strd r6, r7, [%[r], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[r], #72]\n\t" "str r9, [%[r], #76]\n\t" #else "strd r8, r9, [%[r], #72]\n\t" #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r3, %[b], #24\n\t" + "asr r3, %[b], #31\n\t" +#else "sbfx r3, %[b], #7, #1\n\t" +#endif "eor r12, %[b], r3\n\t" "sub r12, r12, r3\n\t" "mov r4, #0\n\t" @@ -1502,7 +1562,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "mov r7, #0\n\t" "mov r8, #0\n\t" "mov r9, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1512,7 +1572,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #31\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #16]\n\t" "ldr r11, [%[base], #20]\n\t" #else @@ -1524,7 +1584,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #48]\n\t" "ldr r11, [%[base], #52]\n\t" #else @@ -1536,7 +1596,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #80]\n\t" "ldr r11, [%[base], #84]\n\t" #else @@ -1549,7 +1609,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1559,7 +1619,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #30\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #16]\n\t" "ldr r11, [%[base], #20]\n\t" #else @@ -1571,7 +1631,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #48]\n\t" "ldr r11, [%[base], #52]\n\t" #else @@ -1583,7 +1643,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #80]\n\t" "ldr r11, [%[base], #84]\n\t" #else @@ -1596,7 +1656,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1606,7 +1666,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #29\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #16]\n\t" "ldr r11, [%[base], #20]\n\t" #else @@ -1618,7 +1678,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #48]\n\t" "ldr r11, [%[base], #52]\n\t" #else @@ -1630,7 +1690,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #80]\n\t" "ldr r11, [%[base], #84]\n\t" #else @@ -1643,7 +1703,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1653,7 +1713,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #28\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #16]\n\t" "ldr r11, [%[base], #20]\n\t" #else @@ -1665,7 +1725,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #48]\n\t" "ldr r11, [%[base], #52]\n\t" #else @@ -1677,7 +1737,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #80]\n\t" "ldr r11, [%[base], #84]\n\t" #else @@ -1690,7 +1750,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1700,7 +1760,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #27\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #16]\n\t" "ldr r11, [%[base], #20]\n\t" #else @@ -1712,7 
+1772,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #48]\n\t" "ldr r11, [%[base], #52]\n\t" #else @@ -1724,7 +1784,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #80]\n\t" "ldr r11, [%[base], #84]\n\t" #else @@ -1737,7 +1797,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1747,7 +1807,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #26\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #16]\n\t" "ldr r11, [%[base], #20]\n\t" #else @@ -1759,7 +1819,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #48]\n\t" "ldr r11, [%[base], #52]\n\t" #else @@ -1771,7 +1831,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #80]\n\t" "ldr r11, [%[base], #84]\n\t" #else @@ -1784,7 +1844,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1794,7 +1854,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #25\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #16]\n\t" "ldr r11, [%[base], #20]\n\t" #else @@ -1806,7 +1866,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #48]\n\t" "ldr r11, [%[base], #52]\n\t" #else @@ -1818,7 +1878,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #80]\n\t" "ldr r11, [%[base], #84]\n\t" #else @@ -1831,7 +1891,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) 
+#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1841,7 +1901,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #24\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #16]\n\t" "ldr r11, [%[base], #20]\n\t" #else @@ -1853,7 +1913,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #48]\n\t" "ldr r11, [%[base], #52]\n\t" #else @@ -1865,7 +1925,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #80]\n\t" "ldr r11, [%[base], #84]\n\t" #else @@ -1899,25 +1959,30 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r11, r11, r9\n\t" "and r11, r11, r12\n\t" "eor r9, r9, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[r], #16]\n\t" "str r5, [%[r], #20]\n\t" #else "strd r4, r5, [%[r], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[r], #48]\n\t" "str r7, [%[r], #52]\n\t" #else "strd r6, r7, [%[r], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[r], #80]\n\t" "str r9, [%[r], #84]\n\t" #else "strd r8, r9, [%[r], #80]\n\t" #endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r3, %[b], #24\n\t" + "asr r3, %[b], #31\n\t" +#else "sbfx r3, %[b], #7, #1\n\t" +#endif "eor r12, %[b], r3\n\t" "sub r12, r12, r3\n\t" "mov r4, #0\n\t" @@ -1926,7 +1991,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "mov r7, #0\n\t" "mov r8, #0\n\t" "mov r9, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1936,7 +2001,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #31\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #24]\n\t" "ldr r11, [%[base], #28]\n\t" #else @@ -1948,7 +2013,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #56]\n\t" "ldr r11, [%[base], #60]\n\t" #else @@ -1960,7 +2025,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #88]\n\t" "ldr r11, [%[base], #92]\n\t" #else @@ -1973,7 +2038,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor 
r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -1983,7 +2048,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #30\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #24]\n\t" "ldr r11, [%[base], #28]\n\t" #else @@ -1995,7 +2060,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #56]\n\t" "ldr r11, [%[base], #60]\n\t" #else @@ -2007,7 +2072,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #88]\n\t" "ldr r11, [%[base], #92]\n\t" #else @@ -2020,7 +2085,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -2030,7 +2095,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #29\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #24]\n\t" "ldr r11, [%[base], #28]\n\t" #else @@ -2042,7 +2107,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #56]\n\t" "ldr r11, [%[base], #60]\n\t" #else @@ -2054,7 +2119,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #88]\n\t" "ldr r11, [%[base], #92]\n\t" #else @@ -2067,7 +2132,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -2077,7 +2142,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #28\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #24]\n\t" "ldr r11, [%[base], #28]\n\t" #else @@ -2089,7 +2154,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #56]\n\t" "ldr 
r11, [%[base], #60]\n\t" #else @@ -2101,7 +2166,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #88]\n\t" "ldr r11, [%[base], #92]\n\t" #else @@ -2114,7 +2179,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -2124,7 +2189,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #27\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #24]\n\t" "ldr r11, [%[base], #28]\n\t" #else @@ -2136,7 +2201,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #56]\n\t" "ldr r11, [%[base], #60]\n\t" #else @@ -2148,7 +2213,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #88]\n\t" "ldr r11, [%[base], #92]\n\t" #else @@ -2161,7 +2226,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -2171,7 +2236,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #26\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #24]\n\t" "ldr r11, [%[base], #28]\n\t" #else @@ -2183,7 +2248,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #56]\n\t" "ldr r11, [%[base], #60]\n\t" #else @@ -2195,7 +2260,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #88]\n\t" "ldr r11, [%[base], #92]\n\t" #else @@ -2208,7 +2273,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -2218,7 +2283,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #25\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #24]\n\t" "ldr r11, [%[base], #28]\n\t" #else @@ -2230,7 +2295,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #56]\n\t" "ldr r11, [%[base], #60]\n\t" #else @@ -2242,7 +2307,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #88]\n\t" "ldr r11, [%[base], #92]\n\t" #else @@ -2255,7 +2320,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r8, r8, r10\n\t" "eor r9, r9, r11\n\t" "add %[base], %[base], #0x60\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x800000\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x0\n\t" @@ -2265,7 +2330,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "ror r3, r3, #24\n\t" "ror r3, r3, r12\n\t" "asr r3, r3, #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #24]\n\t" "ldr r11, [%[base], #28]\n\t" #else @@ -2277,7 +2342,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r4, r4, r10\n\t" "eor r5, r5, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #56]\n\t" "ldr r11, [%[base], #60]\n\t" #else @@ -2289,7 +2354,7 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "and r11, r11, r3\n\t" "eor r6, r6, r10\n\t" "eor r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[base], #88]\n\t" "ldr r11, [%[base], #92]\n\t" #else @@ -2322,19 +2387,19 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) "eor r11, r11, r9\n\t" "and r11, r11, r12\n\t" "eor r9, r9, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[r], #24]\n\t" "str r5, [%[r], #28]\n\t" #else "strd r4, r5, [%[r], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[r], #56]\n\t" "str r7, [%[r], #60]\n\t" #else "strd r6, r7, [%[r], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[r], #88]\n\t" "str r9, [%[r], #92]\n\t" #else @@ -2354,12 +2419,26 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) register signed char b asm ("r2") = (signed char)b_p; __asm__ __volatile__ ( +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "lsl %[b], %[b], #24\n\t" + "asr %[b], %[b], #24\n\t" +#else "sxtb %[b], %[b]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "lsl r3, %[b], #24\n\t" + "asr r3, %[b], #31\n\t" +#else "sbfx r3, %[b], #7, #1\n\t" +#endif "eor %[b], %[b], r3\n\t" "sub %[b], %[b], r3\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "sub lr, %[b], #1\n\t" +#else "clz lr, 
%[b]\n\t" "lsl lr, lr, #26\n\t" +#endif /* defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) */ "asr lr, lr, #31\n\t" "mvn lr, lr\n\t" "add %[b], %[b], lr\n\t" @@ -2455,12 +2534,397 @@ void fe_cmov_table(fe* r_p, fe* base_p, signed char b_p) #endif /* WC_NO_CACHE_RESISTANT */ #endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */ #endif /* HAVE_ED25519 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +void fe_mul_op(void); +void fe_mul_op() +{ + __asm__ __volatile__ ( + "sub sp, sp, #40\n\t" + "str r0, [sp, #36]\n\t" + "mov r0, #0\n\t" + "ldr r12, [r1]\n\t" + /* A[0] * B[0] */ + "ldr lr, [r2]\n\t" + "umull r3, r4, r12, lr\n\t" + /* A[0] * B[2] */ + "ldr lr, [r2, #8]\n\t" + "umull r5, r6, r12, lr\n\t" + /* A[0] * B[4] */ + "ldr lr, [r2, #16]\n\t" + "umull r7, r8, r12, lr\n\t" + /* A[0] * B[6] */ + "ldr lr, [r2, #24]\n\t" + "umull r9, r10, r12, lr\n\t" + "str r3, [sp]\n\t" + /* A[0] * B[1] */ + "ldr lr, [r2, #4]\n\t" + "mov r11, r0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[0] * B[3] */ + "ldr lr, [r2, #12]\n\t" + "adcs r6, r6, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[0] * B[5] */ + "ldr lr, [r2, #20]\n\t" + "adcs r8, r8, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[0] * B[7] */ + "ldr lr, [r2, #28]\n\t" + "adcs r10, r10, #0\n\t" + "adc r3, r0, #0\n\t" + "umlal r10, r3, r12, lr\n\t" + /* A[1] * B[0] */ + "ldr r12, [r1, #4]\n\t" + "ldr lr, [r2]\n\t" + "mov r11, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "str r4, [sp, #4]\n\t" + "adds r5, r5, r11\n\t" + /* A[1] * B[1] */ + "ldr lr, [r2, #4]\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[1] * B[2] */ + "ldr lr, [r2, #8]\n\t" + "adc r11, r0, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[1] * B[3] */ + "ldr lr, [r2, #12]\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[1] * B[4] */ + "ldr lr, [r2, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[1] * B[5] */ + "ldr lr, [r2, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[1] * B[6] */ + "ldr lr, [r2, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[1] * B[7] */ + "ldr lr, [r2, #28]\n\t" + "adc r4, r0, #0\n\t" + "umlal r3, r4, r12, lr\n\t" + /* A[2] * B[0] */ + "ldr r12, [r1, #8]\n\t" + "ldr lr, [r2]\n\t" + "mov r11, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "str r5, [sp, #8]\n\t" + "adds r6, r6, r11\n\t" + /* A[2] * B[1] */ + "ldr lr, [r2, #4]\n\t" + "adc r11, r0, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[2] * B[2] */ + "ldr lr, [r2, #8]\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[2] * B[3] */ + "ldr lr, [r2, #12]\n\t" + "adc r11, r0, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[2] * B[4] */ + "ldr lr, [r2, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[2] * B[5] */ + "ldr lr, [r2, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[2] * B[6] */ + "ldr lr, [r2, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[2] * B[7] */ + "ldr lr, [r2, #28]\n\t" + "adc r5, r0, #0\n\t" + "umlal r4, r5, r12, lr\n\t" + /* A[3] * B[0] */ + "ldr r12, [r1, #12]\n\t" + "ldr lr, 
[r2]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "str r6, [sp, #12]\n\t" + "adds r7, r7, r11\n\t" + /* A[3] * B[1] */ + "ldr lr, [r2, #4]\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[3] * B[2] */ + "ldr lr, [r2, #8]\n\t" + "adc r11, r0, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[3] * B[3] */ + "ldr lr, [r2, #12]\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[3] * B[4] */ + "ldr lr, [r2, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[3] * B[5] */ + "ldr lr, [r2, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[3] * B[6] */ + "ldr lr, [r2, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[3] * B[7] */ + "ldr lr, [r2, #28]\n\t" + "adc r6, r0, #0\n\t" + "umlal r5, r6, r12, lr\n\t" + /* A[4] * B[0] */ + "ldr r12, [r1, #16]\n\t" + "ldr lr, [r2]\n\t" + "mov r11, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "str r7, [sp, #16]\n\t" + "adds r8, r8, r11\n\t" + /* A[4] * B[1] */ + "ldr lr, [r2, #4]\n\t" + "adc r11, r0, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[4] * B[2] */ + "ldr lr, [r2, #8]\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[4] * B[3] */ + "ldr lr, [r2, #12]\n\t" + "adc r11, r0, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[4] * B[4] */ + "ldr lr, [r2, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[4] * B[5] */ + "ldr lr, [r2, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[4] * B[6] */ + "ldr lr, [r2, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[4] * B[7] */ + "ldr lr, [r2, #28]\n\t" + "adc r7, r0, #0\n\t" + "umlal r6, r7, r12, lr\n\t" + /* A[5] * B[0] */ + "ldr r12, [r1, #20]\n\t" + "ldr lr, [r2]\n\t" + "mov r11, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "str r8, [sp, #20]\n\t" + "adds r9, r9, r11\n\t" + /* A[5] * B[1] */ + "ldr lr, [r2, #4]\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[5] * B[2] */ + "ldr lr, [r2, #8]\n\t" + "adc r11, r0, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[5] * B[3] */ + "ldr lr, [r2, #12]\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[5] * B[4] */ + "ldr lr, [r2, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[5] * B[5] */ + "ldr lr, [r2, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[5] * B[6] */ + "ldr lr, [r2, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[5] * B[7] */ + "ldr lr, [r2, #28]\n\t" + "adc r8, r0, #0\n\t" + "umlal r7, r8, r12, lr\n\t" + /* A[6] * B[0] */ + "ldr r12, [r1, #24]\n\t" + "ldr lr, [r2]\n\t" + "mov r11, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "str r9, [sp, #24]\n\t" + "adds r10, r10, r11\n\t" + /* A[6] * B[1] */ + "ldr lr, [r2, #4]\n\t" + "adc r11, r0, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[6] * B[2] */ + "ldr lr, [r2, #8]\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[6] * B[3] */ + "ldr lr, [r2, #12]\n\t" + "adc r11, r0, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds 
r5, r5, r11\n\t" + /* A[6] * B[4] */ + "ldr lr, [r2, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[6] * B[5] */ + "ldr lr, [r2, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[6] * B[6] */ + "ldr lr, [r2, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[6] * B[7] */ + "ldr lr, [r2, #28]\n\t" + "adc r9, r0, #0\n\t" + "umlal r8, r9, r12, lr\n\t" + /* A[7] * B[0] */ + "ldr r12, [r1, #28]\n\t" + "ldr lr, [r2]\n\t" + "mov r11, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "str r10, [sp, #28]\n\t" + "adds r3, r3, r11\n\t" + /* A[7] * B[1] */ + "ldr lr, [r2, #4]\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[7] * B[2] */ + "ldr lr, [r2, #8]\n\t" + "adc r11, r0, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[7] * B[3] */ + "ldr lr, [r2, #12]\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[7] * B[4] */ + "ldr lr, [r2, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[7] * B[5] */ + "ldr lr, [r2, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[7] * B[6] */ + "ldr lr, [r2, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[7] * B[7] */ + "ldr lr, [r2, #28]\n\t" + "adc r10, r0, #0\n\t" + "umlal r9, r10, r12, lr\n\t" + /* Reduce */ + "ldr r2, [sp, #28]\n\t" + "mov lr, sp\n\t" + "mov r12, #38\n\t" + "umull r10, r11, r12, r10\n\t" + "adds r10, r10, r2\n\t" + "adc r11, r11, #0\n\t" + "mov r12, #19\n\t" + "lsl r11, r11, #1\n\t" + "orr r11, r11, r10, LSR #31\n\t" + "mul r11, r12, r11\n\t" + "ldm lr!, {r1, r2}\n\t" + "mov r12, #38\n\t" + "adds r1, r1, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r1, r11, r3, r12\n\t" + "adds r2, r2, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r2, r11, r4, r12\n\t" + "ldm lr!, {r3, r4}\n\t" + "adds r3, r3, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r5, r12\n\t" + "adds r4, r4, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r4, r11, r6, r12\n\t" + "ldm lr!, {r5, r6}\n\t" + "adds r5, r5, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r7, r12\n\t" + "adds r6, r6, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r6, r11, r8, r12\n\t" + "ldm lr!, {r7, r8}\n\t" + "adds r7, r7, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r9, r12\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r10, r10, #0x80000000\n\t" +#else + "bfc r10, #31, #1\n\t" +#endif + "adds r8, r10, r11\n\t" + /* Store */ + "ldr r0, [sp, #36]\n\t" + "stm r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "add sp, sp, #40\n\t" + : + : + : "memory", "lr" + ); +} + +#else void fe_mul_op(void); void fe_mul_op() { __asm__ __volatile__ ( "sub sp, sp, #44\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r0, [sp, #36]\n\t" "str r1, [sp, #40]\n\t" #else @@ -2580,7 +3044,11 @@ void fe_mul_op() "umaal r4, r11, r7, lr\n\t" "umaal r5, r11, r8, lr\n\t" "pop {r6}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r10, r10, #0x80000000\n\t" +#else "bfc r10, #31, #1\n\t" +#endif "umaal r6, r11, r9, lr\n\t" "add r7, r10, r11\n\t" "ldr lr, [sp, #8]\n\t" @@ -2593,6 +3061,7 @@ void fe_mul_op() ); } +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ void fe_mul(fe r_p, const fe a_p, const fe b_p) { register sword32* r asm ("r0") = 
(sword32*)r_p; @@ -2607,6 +3076,284 @@ void fe_mul(fe r_p, const fe a_p, const fe b_p) ); } +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +void fe_sq_op(void); +void fe_sq_op() +{ + __asm__ __volatile__ ( + "sub sp, sp, #0x44\n\t" + "str r0, [sp, #64]\n\t" + /* Square */ + "mov r0, #0\n\t" + "ldr r12, [r1]\n\t" + /* A[0] * A[1] */ + "ldr lr, [r1, #4]\n\t" + "umull r4, r5, r12, lr\n\t" + /* A[0] * A[3] */ + "ldr lr, [r1, #12]\n\t" + "umull r6, r7, r12, lr\n\t" + /* A[0] * A[5] */ + "ldr lr, [r1, #20]\n\t" + "umull r8, r9, r12, lr\n\t" + /* A[0] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "umull r10, r3, r12, lr\n\t" + /* A[0] * A[2] */ + "ldr lr, [r1, #8]\n\t" + "mov r11, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[0] * A[4] */ + "ldr lr, [r1, #16]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[0] * A[6] */ + "ldr lr, [r1, #24]\n\t" + "adcs r9, r9, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + "adcs r3, r3, #0\n\t" + "str r4, [sp, #4]\n\t" + "str r5, [sp, #8]\n\t" + /* A[1] * A[2] */ + "ldr r12, [r1, #4]\n\t" + "ldr lr, [r1, #8]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "str r6, [sp, #12]\n\t" + "adds r7, r7, r11\n\t" + /* A[1] * A[3] */ + "ldr lr, [r1, #12]\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "str r7, [sp, #16]\n\t" + "adds r8, r8, r11\n\t" + /* A[1] * A[4] */ + "ldr lr, [r1, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[1] * A[5] */ + "ldr lr, [r1, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[1] * A[6] */ + "ldr lr, [r1, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[1] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "adc r4, r0, #0\n\t" + "umlal r3, r4, r12, lr\n\t" + /* A[2] * A[3] */ + "ldr r12, [r1, #8]\n\t" + "ldr lr, [r1, #12]\n\t" + "mov r11, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "str r8, [sp, #20]\n\t" + "adds r9, r9, r11\n\t" + /* A[2] * A[4] */ + "ldr lr, [r1, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "str r9, [sp, #24]\n\t" + "adds r10, r10, r11\n\t" + /* A[2] * A[5] */ + "ldr lr, [r1, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[2] * A[6] */ + "ldr lr, [r1, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[2] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "adc r5, r0, #0\n\t" + "umlal r4, r5, r12, lr\n\t" + /* A[3] * A[4] */ + "ldr r12, [r1, #12]\n\t" + "ldr lr, [r1, #16]\n\t" + "mov r11, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "str r10, [sp, #28]\n\t" + "adds r3, r3, r11\n\t" + /* A[3] * A[5] */ + "ldr lr, [r1, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[3] * A[6] */ + "ldr lr, [r1, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[3] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "adc r6, r0, #0\n\t" + "umlal r5, r6, r12, lr\n\t" + /* A[4] * A[5] */ + "ldr r12, [r1, #16]\n\t" + "ldr lr, [r1, #20]\n\t" + "mov r11, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[4] * A[6] */ + "ldr lr, [r1, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[4] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "adc r7, r0, #0\n\t" + "umlal r6, r7, r12, lr\n\t" + /* A[5] * A[6] */ + "ldr r12, [r1, 
#20]\n\t" + "ldr lr, [r1, #24]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[5] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "adc r8, r0, #0\n\t" + "umlal r7, r8, r12, lr\n\t" + /* A[6] * A[7] */ + "ldr r12, [r1, #24]\n\t" + "ldr lr, [r1, #28]\n\t" + "mov r9, #0\n\t" + "umlal r8, r9, r12, lr\n\t" + "add lr, sp, #32\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "add lr, sp, #4\n\t" + "ldm lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "adds r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "stm lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "adcs r3, r3, r3\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r0, #0\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add lr, sp, #4\n\t" + "ldm lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "mov lr, sp\n\t" + /* A[0] * A[0] */ + "ldr r12, [r1]\n\t" + "umull r3, r11, r12, r12\n\t" + "adds r4, r4, r11\n\t" + /* A[1] * A[1] */ + "ldr r12, [r1, #4]\n\t" + "adcs r5, r5, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, r12\n\t" + "adds r6, r6, r11\n\t" + /* A[2] * A[2] */ + "ldr r12, [r1, #8]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, r12\n\t" + "adds r8, r8, r11\n\t" + /* A[3] * A[3] */ + "ldr r12, [r1, #12]\n\t" + "adcs r9, r9, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, r12\n\t" + "adds r10, r10, r11\n\t" + "stm lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* A[4] * A[4] */ + "ldr r12, [r1, #16]\n\t" + "adcs r3, r3, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, r12\n\t" + "adds r4, r4, r11\n\t" + /* A[5] * A[5] */ + "ldr r12, [r1, #20]\n\t" + "adcs r5, r5, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, r12\n\t" + "adds r6, r6, r11\n\t" + /* A[6] * A[6] */ + "ldr r12, [r1, #24]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, r12\n\t" + "adds r8, r8, r11\n\t" + /* A[7] * A[7] */ + "ldr r12, [r1, #28]\n\t" + "adcs r9, r9, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r9, r10, r12, r12\n\t" + /* Reduce */ + "ldr r2, [sp, #28]\n\t" + "mov lr, sp\n\t" + "mov r12, #38\n\t" + "umull r10, r11, r12, r10\n\t" + "adds r10, r10, r2\n\t" + "adc r11, r11, #0\n\t" + "mov r12, #19\n\t" + "lsl r11, r11, #1\n\t" + "orr r11, r11, r10, LSR #31\n\t" + "mul r11, r12, r11\n\t" + "ldm lr!, {r1, r2}\n\t" + "mov r12, #38\n\t" + "adds r1, r1, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r1, r11, r3, r12\n\t" + "adds r2, r2, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r2, r11, r4, r12\n\t" + "ldm lr!, {r3, r4}\n\t" + "adds r3, r3, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r5, r12\n\t" + "adds r4, r4, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r4, r11, r6, r12\n\t" + "ldm lr!, {r5, r6}\n\t" + "adds r5, r5, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r7, r12\n\t" + "adds r6, r6, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r6, r11, r8, r12\n\t" + "ldm lr!, {r7, r8}\n\t" + "adds r7, r7, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r9, r12\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r10, r10, #0x80000000\n\t" +#else + "bfc r10, #31, #1\n\t" +#endif + "adds r8, r10, r11\n\t" + /* Store */ + "ldr r0, [sp, #64]\n\t" + "stm r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "add sp, sp, #0x44\n\t" + : + : + : "memory", 
"lr" + ); +} + +#else void fe_sq_op(void); void fe_sq_op() { @@ -2629,7 +3376,7 @@ void fe_sq_op() "umaal r9, r12, r1, r2\n\t" "adcs r9, r9, r9\n\t" "umaal r9, r11, lr, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [sp, #8]\n\t" "str r9, [sp, #12]\n\t" #else @@ -2719,7 +3466,11 @@ void fe_sq_op() "mov r12, r6\n\t" "pop {r5-r6}\n\t" "umaal r5, lr, r8, r12\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r7, r7, #0x80000000\n\t" +#else "bfc r7, #31, #1\n\t" +#endif "umaal r6, lr, r9, r12\n\t" "add r7, r7, lr\n\t" "pop {lr}\n\t" @@ -2731,6 +3482,7 @@ void fe_sq_op() ); } +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ void fe_sq(fe r_p, const fe a_p) { register sword32* r asm ("r0") = (sword32*)r_p; @@ -2745,6 +3497,7 @@ void fe_sq(fe r_p, const fe a_p) } #ifdef HAVE_CURVE25519 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) void fe_mul121666(fe r_p, fe a_p) { register sword32* r asm ("r0") = (sword32*)r_p; @@ -2753,34 +3506,59 @@ void fe_mul121666(fe r_p, fe a_p) __asm__ __volatile__ ( /* Multiply by 121666 */ "ldm %[a], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "mov lr, #0xdb\n\t" - "lsl lr, lr, #8\n\t" - "add lr, lr, #0x42\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #1\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xdb\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0x42\n\t" #else - "mov lr, #0xdb42\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0xdb\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0x42\n\t" +#else + "mov r10, #0xdb42\n\t" #endif - "movt lr, #1\n\t" - "umull r2, r10, r2, lr\n\t" - "sub r12, lr, #1\n\t" - "umaal r3, r10, r3, r12\n\t" - "umaal r4, r10, r4, r12\n\t" - "umaal r5, r10, r5, r12\n\t" - "umaal r6, r10, r6, r12\n\t" - "umaal r7, r10, r7, r12\n\t" - "umaal r8, r10, r8, r12\n\t" - "mov lr, #19\n\t" - "umaal r9, r10, r9, r12\n\t" - "lsl r10, r10, #1\n\t" - "orr r10, r10, r9, lsr #31\n\t" - "mul r10, r10, lr\n\t" - "adds r2, r2, r10\n\t" + "movt r10, #1\n\t" +#endif + "umull r2, r12, r10, r2\n\t" + "umull r3, lr, r10, r3\n\t" + "adds r3, r3, r12\n\t" + "adc lr, lr, #0\n\t" + "umull r4, r12, r10, r4\n\t" + "adds r4, r4, lr\n\t" + "adc r12, r12, #0\n\t" + "umull r5, lr, r10, r5\n\t" + "adds r5, r5, r12\n\t" + "adc lr, lr, #0\n\t" + "umull r6, r12, r10, r6\n\t" + "adds r6, r6, lr\n\t" + "adc r12, r12, #0\n\t" + "umull r7, lr, r10, r7\n\t" + "adds r7, r7, r12\n\t" + "adc lr, lr, #0\n\t" + "umull r8, r12, r10, r8\n\t" + "adds r8, r8, lr\n\t" + "adc r12, r12, #0\n\t" + "umull r9, lr, r10, r9\n\t" + "adds r9, r9, r12\n\t" + "mov r10, #19\n\t" + "adc lr, lr, #0\n\t" + "lsl lr, lr, #1\n\t" + "orr lr, lr, r9, LSR #31\n\t" + "mul lr, r10, lr\n\t" + "adds r2, r2, lr\n\t" "adcs r3, r3, #0\n\t" "adcs r4, r4, #0\n\t" "adcs r5, r5, #0\n\t" "adcs r6, r6, #0\n\t" "adcs r7, r7, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0x80000000\n\t" +#else "bfc r9, #31, #1\n\t" +#endif "adcs r8, r8, #0\n\t" "adc r9, r9, #0\n\t" "stm %[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" @@ -2790,6 +3568,65 @@ void fe_mul121666(fe r_p, fe a_p) ); } +#else +void fe_mul121666(fe r_p, fe a_p) +{ + register sword32* r asm ("r0") = (sword32*)r_p; + register sword32* a asm ("r1") = (sword32*)a_p; + + __asm__ __volatile__ ( + /* Multiply by 121666 */ + "ldm %[a], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) + "mov lr, #1\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xdb\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0x42\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov lr, #0xdb\n\t" + "lsl lr, lr, #8\n\t" + "add lr, lr, #0x42\n\t" +#else + "mov lr, #0xdb42\n\t" +#endif + "movt lr, #1\n\t" +#endif + "umull r2, r10, lr, r2\n\t" + "sub r12, lr, #1\n\t" + "umaal r3, r10, r12, r3\n\t" + "umaal r4, r10, r12, r4\n\t" + "umaal r5, r10, r12, r5\n\t" + "umaal r6, r10, r12, r6\n\t" + "umaal r7, r10, r12, r7\n\t" + "umaal r8, r10, r12, r8\n\t" + "mov lr, #19\n\t" + "umaal r9, r10, r12, r9\n\t" + "lsl r10, r10, #1\n\t" + "orr r10, r10, r9, lsr #31\n\t" + "mul r10, lr, r10\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0x80000000\n\t" +#else + "bfc r9, #31, #1\n\t" +#endif + "adcs r8, r8, #0\n\t" + "adc r9, r9, #0\n\t" + "stm %[r], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + ); +} + +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ #ifndef WC_NO_CACHE_RESISTANT int curve25519(byte* r_p, const byte* n_p, const byte* a_p) { @@ -3476,7 +4313,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" "adcs r9, r9, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r11, r11, #0x80000000\n\t" +#else "bfc r11, #31, #1\n\t" +#endif "adcs r10, r10, #0\n\t" "adc r11, r11, #0\n\t" "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" @@ -3662,6 +4503,327 @@ void fe_invert(fe r_p, const fe a_p) ); } +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +void fe_sq2(fe r_p, const fe a_p) +{ + register sword32* r asm ("r0") = (sword32*)r_p; + register const sword32* a asm ("r1") = (const sword32*)a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x44\n\t" + "str r0, [sp, #64]\n\t" + /* Square * 2 */ + "mov r0, #0\n\t" + "ldr r12, [r1]\n\t" + /* A[0] * A[1] */ + "ldr lr, [r1, #4]\n\t" + "umull r4, r5, r12, lr\n\t" + /* A[0] * A[3] */ + "ldr lr, [r1, #12]\n\t" + "umull r6, r7, r12, lr\n\t" + /* A[0] * A[5] */ + "ldr lr, [r1, #20]\n\t" + "umull r8, r9, r12, lr\n\t" + /* A[0] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "umull r10, r3, r12, lr\n\t" + /* A[0] * A[2] */ + "ldr lr, [r1, #8]\n\t" + "mov r11, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[0] * A[4] */ + "ldr lr, [r1, #16]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[0] * A[6] */ + "ldr lr, [r1, #24]\n\t" + "adcs r9, r9, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + "adcs r3, r3, #0\n\t" + "str r4, [sp, #4]\n\t" + "str r5, [sp, #8]\n\t" + /* A[1] * A[2] */ + "ldr r12, [r1, #4]\n\t" + "ldr lr, [r1, #8]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "str r6, [sp, #12]\n\t" + "adds r7, r7, r11\n\t" + /* A[1] * A[3] */ + "ldr lr, [r1, #12]\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "str r7, [sp, #16]\n\t" + "adds r8, r8, r11\n\t" + /* A[1] * A[4] */ + "ldr lr, [r1, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[1] * A[5] */ + "ldr lr, [r1, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[1] * A[6] */ + 
"ldr lr, [r1, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[1] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "adc r4, r0, #0\n\t" + "umlal r3, r4, r12, lr\n\t" + /* A[2] * A[3] */ + "ldr r12, [r1, #8]\n\t" + "ldr lr, [r1, #12]\n\t" + "mov r11, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "str r8, [sp, #20]\n\t" + "adds r9, r9, r11\n\t" + /* A[2] * A[4] */ + "ldr lr, [r1, #16]\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "str r9, [sp, #24]\n\t" + "adds r10, r10, r11\n\t" + /* A[2] * A[5] */ + "ldr lr, [r1, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[2] * A[6] */ + "ldr lr, [r1, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[2] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "adc r5, r0, #0\n\t" + "umlal r4, r5, r12, lr\n\t" + /* A[3] * A[4] */ + "ldr r12, [r1, #12]\n\t" + "ldr lr, [r1, #16]\n\t" + "mov r11, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "str r10, [sp, #28]\n\t" + "adds r3, r3, r11\n\t" + /* A[3] * A[5] */ + "ldr lr, [r1, #20]\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[3] * A[6] */ + "ldr lr, [r1, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[3] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "adc r6, r0, #0\n\t" + "umlal r5, r6, r12, lr\n\t" + /* A[4] * A[5] */ + "ldr r12, [r1, #16]\n\t" + "ldr lr, [r1, #20]\n\t" + "mov r11, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[4] * A[6] */ + "ldr lr, [r1, #24]\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[4] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "adc r7, r0, #0\n\t" + "umlal r6, r7, r12, lr\n\t" + /* A[5] * A[6] */ + "ldr r12, [r1, #20]\n\t" + "ldr lr, [r1, #24]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[5] * A[7] */ + "ldr lr, [r1, #28]\n\t" + "adc r8, r0, #0\n\t" + "umlal r7, r8, r12, lr\n\t" + /* A[6] * A[7] */ + "ldr r12, [r1, #24]\n\t" + "ldr lr, [r1, #28]\n\t" + "mov r9, #0\n\t" + "umlal r8, r9, r12, lr\n\t" + "add lr, sp, #32\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "add lr, sp, #4\n\t" + "ldm lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "adds r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "stm lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "adcs r3, r3, r3\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, r0, #0\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add lr, sp, #4\n\t" + "ldm lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "mov lr, sp\n\t" + /* A[0] * A[0] */ + "ldr r12, [r1]\n\t" + "umull r3, r11, r12, r12\n\t" + "adds r4, r4, r11\n\t" + /* A[1] * A[1] */ + "ldr r12, [r1, #4]\n\t" + "adcs r5, r5, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, r12\n\t" + "adds r6, r6, r11\n\t" + /* A[2] * A[2] */ + "ldr r12, [r1, #8]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, r12\n\t" + "adds r8, r8, r11\n\t" + /* A[3] * A[3] */ + "ldr r12, [r1, #12]\n\t" + "adcs r9, r9, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r9, r11, r12, r12\n\t" + "adds r10, r10, r11\n\t" + "stm lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* A[4] * 
A[4] */ + "ldr r12, [r1, #16]\n\t" + "adcs r3, r3, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r12, r12\n\t" + "adds r4, r4, r11\n\t" + /* A[5] * A[5] */ + "ldr r12, [r1, #20]\n\t" + "adcs r5, r5, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r12, r12\n\t" + "adds r6, r6, r11\n\t" + /* A[6] * A[6] */ + "ldr r12, [r1, #24]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r12, r12\n\t" + "adds r8, r8, r11\n\t" + /* A[7] * A[7] */ + "ldr r12, [r1, #28]\n\t" + "adcs r9, r9, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r9, r10, r12, r12\n\t" + /* Reduce */ + "ldr r2, [sp, #28]\n\t" + "mov lr, sp\n\t" + "mov r12, #38\n\t" + "umull r10, r11, r12, r10\n\t" + "adds r10, r10, r2\n\t" + "adc r11, r11, #0\n\t" + "mov r12, #19\n\t" + "lsl r11, r11, #1\n\t" + "orr r11, r11, r10, LSR #31\n\t" + "mul r11, r12, r11\n\t" + "ldm lr!, {r1, r2}\n\t" + "mov r12, #38\n\t" + "adds r1, r1, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r1, r11, r3, r12\n\t" + "adds r2, r2, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r2, r11, r4, r12\n\t" + "ldm lr!, {r3, r4}\n\t" + "adds r3, r3, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r3, r11, r5, r12\n\t" + "adds r4, r4, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r4, r11, r6, r12\n\t" + "ldm lr!, {r5, r6}\n\t" + "adds r5, r5, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r5, r11, r7, r12\n\t" + "adds r6, r6, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r6, r11, r8, r12\n\t" + "ldm lr!, {r7, r8}\n\t" + "adds r7, r7, r11\n\t" + "adc r11, r0, #0\n\t" + "umlal r7, r11, r9, r12\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r10, r10, #0x80000000\n\t" +#else + "bfc r10, #31, #1\n\t" +#endif + "adds r8, r10, r11\n\t" + /* Reduce if top bit set */ + "mov r12, #19\n\t" + "and r11, r12, r8, ASR #31\n\t" + "adds r1, r1, r11\n\t" + "adcs r2, r2, #0\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r8, r8, #0x80000000\n\t" +#else + "bfc r8, #31, #1\n\t" +#endif + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + /* Double */ + "adds r1, r1, r1\n\t" + "adcs r2, r2, r2\n\t" + "adcs r3, r3, r3\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adc r8, r8, r8\n\t" + /* Reduce if top bit set */ + "mov r12, #19\n\t" + "and r11, r12, r8, ASR #31\n\t" + "adds r1, r1, r11\n\t" + "adcs r2, r2, #0\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r8, r8, #0x80000000\n\t" +#else + "bfc r8, #31, #1\n\t" +#endif + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + /* Store */ + "ldr r0, [sp, #64]\n\t" + "stm r0, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "add sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "lr" + ); +} + +#else void fe_sq2(fe r_p, const fe a_p) { register sword32* r asm ("r0") = (sword32*)r_p; @@ -3669,7 +4831,7 @@ void fe_sq2(fe r_p, const fe a_p) __asm__ __volatile__ ( "sub sp, sp, #36\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r0, [sp, #28]\n\t" "str r1, [sp, #32]\n\t" #else @@ -3691,7 +4853,7 @@ void fe_sq2(fe r_p, const fe a_p) "umaal r9, r12, r1, r2\n\t" "adcs r9, r9, r9\n\t" "umaal r9, r11, lr, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [sp, #8]\n\t" "str r9, [sp, #12]\n\t" #else 
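/* Editor's sketch, not part of the patch: the recurring substitution in these
 * hunks swaps the ARMv6T2/ARMv7-only bit-field-clear (BFC) for a plain BIC,
 * since both simply clear bit 31 (the 2^255 bit) of the top word before the
 * final carry step of the mod 2^255-19 reduction. Assumes GCC/Clang inline
 * asm on an ARM target; the helper name clear_top_bit is hypothetical and
 * only illustrates the equivalence of the two code paths. */
#include <stdint.h>

static inline uint32_t clear_top_bit(uint32_t w)
{
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
    /* BIC with an 8-bit rotated immediate is available on every 32-bit ARM. */
    __asm__ ("bic %0, %0, #0x80000000" : "+r" (w));
#else
    /* BFC (bit field clear) requires ARMv6T2 or later. */
    __asm__ ("bfc %0, #31, #1" : "+r" (w));
#endif
    return w;
}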
@@ -3781,7 +4943,11 @@ void fe_sq2(fe r_p, const fe a_p) "mov r12, r6\n\t" "pop {r5-r6}\n\t" "umaal r5, lr, r8, r12\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r7, r7, #0x80000000\n\t" +#else "bfc r7, #31, #1\n\t" +#endif "umaal r6, lr, r9, r12\n\t" "add r7, r7, lr\n\t" /* Reduce if top bit set */ @@ -3793,7 +4959,11 @@ void fe_sq2(fe r_p, const fe a_p) "adcs r3, r3, #0\n\t" "adcs r4, r4, #0\n\t" "adcs r5, r5, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r7, r7, #0x80000000\n\t" +#else "bfc r7, #31, #1\n\t" +#endif "adcs r6, r6, #0\n\t" "adc r7, r7, #0\n\t" /* Double */ @@ -3814,7 +4984,11 @@ void fe_sq2(fe r_p, const fe a_p) "adcs r3, r3, #0\n\t" "adcs r4, r4, #0\n\t" "adcs r5, r5, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r7, r7, #0x80000000\n\t" +#else "bfc r7, #31, #1\n\t" +#endif "adcs r6, r6, #0\n\t" "adc r7, r7, #0\n\t" "pop {r12, lr}\n\t" @@ -3828,6 +5002,7 @@ void fe_sq2(fe r_p, const fe a_p) ); } +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ void fe_pow22523(fe r_p, const fe a_p) { register sword32* r asm ("r0") = (sword32*)r_p; @@ -4177,7 +5352,11 @@ void ge_madd(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_precomp * q_p) "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" "adcs r9, r9, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r11, r11, #0x80000000\n\t" +#else "bfc r11, #31, #1\n\t" +#endif "adcs r10, r10, #0\n\t" "adc r11, r11, #0\n\t" "stm r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" @@ -4259,7 +5438,11 @@ void ge_msub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_precomp * q_p) "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" "adcs r9, r9, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r11, r11, #0x80000000\n\t" +#else "bfc r11, #31, #1\n\t" +#endif "adcs r10, r10, #0\n\t" "adc r11, r11, #0\n\t" "stm r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" @@ -4337,7 +5520,11 @@ void ge_add(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p) "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" "adcs r9, r9, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r11, r11, #0x80000000\n\t" +#else "bfc r11, #31, #1\n\t" +#endif "adcs r10, r10, #0\n\t" "adc r11, r11, #0\n\t" "stm r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" @@ -4420,7 +5607,11 @@ void ge_sub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p) "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" "adcs r9, r9, #0\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r11, r11, #0x80000000\n\t" +#else "bfc r11, #31, #1\n\t" +#endif "adcs r10, r10, #0\n\t" "adc r11, r11, #0\n\t" "stm r0, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" @@ -4441,36 +5632,51 @@ void ge_sub(ge_p1p1 * r_p, const ge_p3 * p_p, const ge_cached* q_p) ); } +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) void sc_reduce(byte* s_p) { register byte* s asm ("r0") = (byte*)s_p; __asm__ __volatile__ ( - "sub sp, sp, #52\n\t" + "sub sp, sp, #56\n\t" + "str %[s], [sp, #52]\n\t" /* Load bits 252-511 */ "add %[s], %[s], #28\n\t" "ldm %[s], {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" "lsr lr, r9, #24\n\t" "lsl r9, r9, #4\n\t" - "orr r9, r9, r8, lsr #28\n\t" + "orr r9, r9, r8, LSR #28\n\t" "lsl r8, r8, #4\n\t" - "orr r8, r8, r7, lsr #28\n\t" + "orr r8, r8, r7, LSR #28\n\t" "lsl r7, r7, #4\n\t" - "orr r7, r7, r6, lsr #28\n\t" + "orr r7, r7, r6, LSR #28\n\t" "lsl r6, r6, #4\n\t" - "orr r6, r6, r5, lsr #28\n\t" + "orr r6, r6, r5, LSR #28\n\t" "lsl r5, r5, #4\n\t" - "orr r5, r5, r4, lsr #28\n\t" + "orr r5, r5, r4, LSR #28\n\t" 
"lsl r4, r4, #4\n\t" - "orr r4, r4, r3, lsr #28\n\t" + "orr r4, r4, r3, LSR #28\n\t" "lsl r3, r3, #4\n\t" - "orr r3, r3, r2, lsr #28\n\t" + "orr r3, r3, r2, LSR #28\n\t" "lsl r2, r2, #4\n\t" - "orr r2, r2, r1, lsr #28\n\t" + "orr r2, r2, r1, LSR #28\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else "bfc r9, #28, #4\n\t" +#endif "sub %[s], %[s], #28\n\t" /* Add order times bits 504..511 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0xa3\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #10\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #44\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r10, #0x2c\n\t" "lsl r10, r10, #8\n\t" "add r10, r10, #0x13\n\t" @@ -4478,7 +5684,17 @@ void sc_reduce(byte* s_p) "mov r10, #0x2c13\n\t" #endif "movt r10, #0xa30a\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0xa7\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0xed\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0x9c\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r11, #0x9c\n\t" "lsl r11, r11, #8\n\t" "add r11, r11, #0xe5\n\t" @@ -4486,10 +5702,23 @@ void sc_reduce(byte* s_p) "mov r11, #0x9ce5\n\t" #endif "movt r11, #0xa7ed\n\t" +#endif "mov r1, #0\n\t" "umlal r2, r1, r10, lr\n\t" - "umaal r3, r1, r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "adds r3, r3, r1\n\t" + "mov r1, #0\n\t" + "adc r1, r1, #0\n\t" + "umlal r3, r1, r11, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x5d\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #8\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0x63\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r10, #0x63\n\t" "lsl r10, r10, #8\n\t" "add r10, r10, #0x29\n\t" @@ -4497,7 +5726,17 @@ void sc_reduce(byte* s_p) "mov r10, #0x6329\n\t" #endif "movt r10, #0x5d08\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0xeb\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #33\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #6\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r11, #0x6\n\t" "lsl r11, r11, #8\n\t" "add r11, r11, #0x21\n\t" @@ -4505,8 +5744,15 @@ void sc_reduce(byte* s_p) "mov r11, #0x621\n\t" #endif "movt r11, #0xeb21\n\t" - "umaal r4, r1, r10, lr\n\t" - "umaal r5, r1, r11, lr\n\t" +#endif + "adds r4, r4, r1\n\t" + "mov r1, #0\n\t" + "adc r1, r1, #0\n\t" + "umlal r4, r1, r10, lr\n\t" + "adds r5, r5, r1\n\t" + "mov r1, #0\n\t" + "adc r1, r1, #0\n\t" + "umlal r5, r1, r11, lr\n\t" "adds r6, r6, r1\n\t" "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" @@ -4516,7 +5762,17 @@ void sc_reduce(byte* s_p) "sbcs r8, r8, #0\n\t" "sbc r9, r9, #0\n\t" /* Sub product of top 8 words and order */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov r12, sp\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xa3\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #10\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #44\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #19\n\t" +#else +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r1, #0x2c\n\t" "lsl r1, r1, #8\n\t" "add r1, r1, #0x13\n\t" @@ -4524,25 +5780,62 @@ void sc_reduce(byte* s_p) "mov r1, #0x2c13\n\t" #endif "movt r1, #0xa30a\n\t" +#endif "mov lr, #0\n\t" - "ldm %[s]!, {r10, r11, r12}\n\t" - "umlal r10, lr, r2, r1\n\t" - "umaal r11, lr, r3, r1\n\t" - "umaal r12, lr, r4, r1\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm %[s]!, {r10, r11, r12}\n\t" - "umaal r10, lr, r5, r1\n\t" - "umaal r11, lr, r6, r1\n\t" - "umaal r12, lr, r7, r1\n\t" - "stm sp!, {r10, r11, r12}\n\t" "ldm %[s]!, {r10, r11}\n\t" - "umaal r10, lr, r8, r1\n\t" + "umlal r10, lr, r2, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r3, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r4, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r5, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r6, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r7, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r8, r1\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r11, r11, #0xf0000000\n\t" +#else "bfc r11, #28, #4\n\t" - "umaal r11, lr, r9, r1\n\t" - "stm sp!, {r10, r11, lr}\n\t" +#endif + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r9, r1\n\t" + "stm r12!, {r10, r11, lr}\n\t" "sub %[s], %[s], #16\n\t" - "sub sp, sp, #32\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xa7\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0xed\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0x9c\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r1, #0x9c\n\t" "lsl r1, r1, #8\n\t" "add r1, r1, #0xe5\n\t" @@ -4550,23 +5843,56 @@ void sc_reduce(byte* s_p) "mov r1, #0x9ce5\n\t" #endif "movt r1, #0xa7ed\n\t" +#endif "mov lr, #0\n\t" - "ldm sp, {r10, r11, r12}\n\t" + "ldm r12, {r10, r11}\n\t" "umlal r10, lr, r2, r1\n\t" - "umaal r11, lr, r3, r1\n\t" - "umaal r12, lr, r4, r1\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" - "umaal r10, lr, r5, r1\n\t" - "umaal r11, lr, r6, r1\n\t" - "umaal r12, lr, r7, r1\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11}\n\t" - "umaal r10, lr, r8, r1\n\t" - "umaal r11, lr, r9, r1\n\t" - "stm sp!, {r10, r11, lr}\n\t" - "sub sp, sp, #32\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r3, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r4, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r5, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r6, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r7, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, 
r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r8, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r9, r1\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x5d\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #8\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0x63\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r1, #0x63\n\t" "lsl r1, r1, #8\n\t" "add r1, r1, #0x29\n\t" @@ -4574,23 +5900,56 @@ void sc_reduce(byte* s_p) "mov r1, #0x6329\n\t" #endif "movt r1, #0x5d08\n\t" +#endif "mov lr, #0\n\t" - "ldm sp, {r10, r11, r12}\n\t" + "ldm r12, {r10, r11}\n\t" "umlal r10, lr, r2, r1\n\t" - "umaal r11, lr, r3, r1\n\t" - "umaal r12, lr, r4, r1\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" - "umaal r10, lr, r5, r1\n\t" - "umaal r11, lr, r6, r1\n\t" - "umaal r12, lr, r7, r1\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11}\n\t" - "umaal r10, lr, r8, r1\n\t" - "umaal r11, lr, r9, r1\n\t" - "stm sp!, {r10, r11, lr}\n\t" - "sub sp, sp, #32\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r3, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r4, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r5, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r6, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r7, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r8, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r9, r1\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xeb\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #33\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #6\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r1, #0x6\n\t" "lsl r1, r1, #8\n\t" "add r1, r1, #0x21\n\t" @@ -4598,48 +5957,83 @@ void sc_reduce(byte* s_p) "mov r1, #0x621\n\t" #endif "movt r1, #0xeb21\n\t" +#endif "mov lr, #0\n\t" - "ldm sp, {r10, r11, r12}\n\t" + "ldm r12, {r10, r11}\n\t" "umlal r10, lr, r2, r1\n\t" - "umaal r11, lr, r3, r1\n\t" - "umaal r12, lr, r4, r1\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" - "umaal r10, lr, r5, r1\n\t" - "umaal r11, lr, r6, r1\n\t" - "umaal r12, lr, r7, r1\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11}\n\t" - "umaal r10, lr, r8, r1\n\t" - "umaal r11, lr, r9, r1\n\t" - "stm sp!, {r10, r11, lr}\n\t" - "sub sp, sp, #32\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r3, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r4, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r5, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, 
r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r6, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r7, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r8, r1\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r9, r1\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" /* Subtract at 4 * 32 */ - "ldm sp, {r10, r11, r12}\n\t" + "ldm r12, {r10, r11}\n\t" "subs r10, r10, r2\n\t" "sbcs r11, r11, r3\n\t" - "sbcs r12, r12, r4\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" - "sbcs r10, r10, r5\n\t" - "sbcs r11, r11, r6\n\t" - "sbcs r12, r12, r7\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11}\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "sbcs r10, r10, r4\n\t" + "sbcs r11, r11, r5\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "sbcs r10, r10, r6\n\t" + "sbcs r11, r11, r7\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" "sbcs r10, r10, r8\n\t" "sbc r11, r11, r9\n\t" - "stm sp!, {r10, r11}\n\t" - "sub sp, sp, #36\n\t" + "stm r12!, {r10, r11}\n\t" + "sub r12, r12, #36\n\t" "asr lr, r11, #25\n\t" /* Conditionally subtract order starting at bit 125 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r1, #0xa00000\n\t" "lsl r1, r1, #8\n\t" "add r1, r1, #0x0\n\t" #else "mov r1, #0xa0000000\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r2, #0x4b\n\t" + "lsl r2, r2, #8\n\t" + "orr r2, r2, #0x9e\n\t" + "lsl r2, r2, #8\n\t" + "orr r2, r2, #0xba\n\t" + "lsl r2, r2, #8\n\t" + "orr r2, r2, #0x7d\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0xba\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x7d\n\t" @@ -4647,7 +6041,17 @@ void sc_reduce(byte* s_p) "mov r2, #0xba7d\n\t" #endif "movt r2, #0x4b9e\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r3, #0xcb\n\t" + "lsl r3, r3, #8\n\t" + "orr r3, r3, #2\n\t" + "lsl r3, r3, #8\n\t" + "orr r3, r3, #0x4c\n\t" + "lsl r3, r3, #8\n\t" + "orr r3, r3, #0x63\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x4c\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0x63\n\t" @@ -4655,7 +6059,17 @@ void sc_reduce(byte* s_p) "mov r3, #0x4c63\n\t" #endif "movt r3, #0xcb02\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r4, #0xd4\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0x5e\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0xf3\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0x9a\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r4, #0xf3\n\t" "lsl r4, r4, #8\n\t" "add r4, r4, #0x9a\n\t" @@ -4663,7 +6077,17 @@ void sc_reduce(byte* s_p) "mov r4, #0xf39a\n\t" #endif "movt r4, #0xd45e\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r5, #2\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #0x9b\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #0xdf\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #59\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r5, 
#0xdf\n\t" "lsl r5, r5, #8\n\t" "add r5, r5, #0x3b\n\t" @@ -4671,7 +6095,8 @@ void sc_reduce(byte* s_p) "mov r5, #0xdf3b\n\t" #endif "movt r5, #0x29b\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r9, #0x20000\n\t" "lsl r9, r9, #8\n\t" "add r9, r9, #0x0\n\t" @@ -4684,26 +6109,30 @@ void sc_reduce(byte* s_p) "and r4, r4, lr\n\t" "and r5, r5, lr\n\t" "and r9, r9, lr\n\t" - "ldm sp, {r10, r11, r12}\n\t" + "ldm r12, {r10, r11}\n\t" "adds r10, r10, r1\n\t" "adcs r11, r11, r2\n\t" - "adcs r12, r12, r3\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" - "adcs r10, r10, r4\n\t" - "adcs r11, r11, r5\n\t" - "adcs r12, r12, #0\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adcs r10, r10, r3\n\t" + "adcs r11, r11, r4\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adcs r10, r10, r5\n\t" + "adcs r11, r11, #0\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" "adcs r10, r10, #0\n\t" "adcs r11, r11, #0\n\t" - "adcs r12, r12, r9\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "sub sp, sp, #48\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10}\n\t" + "adcs r10, r10, #0\n\t" + "stm r12!, {r10}\n\t" "sub %[s], %[s], #16\n\t" + "mov r12, sp\n\t" /* Load bits 252-376 */ - "add sp, sp, #28\n\t" - "ldm sp, {r1, r2, r3, r4, r5}\n\t" + "add r12, r12, #28\n\t" + "ldm r12, {r1, r2, r3, r4, r5}\n\t" "lsl r5, r5, #4\n\t" "orr r5, r5, r4, lsr #28\n\t" "lsl r4, r4, #4\n\t" @@ -4712,11 +6141,25 @@ void sc_reduce(byte* s_p) "orr r3, r3, r2, lsr #28\n\t" "lsl r2, r2, #4\n\t" "orr r2, r2, r1, lsr #28\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r5, r5, #0xe0000000\n\t" +#else "bfc r5, #29, #3\n\t" - "sub sp, sp, #28\n\t" - /* Sub product of top 8 words and order */ +#endif + "sub r12, r12, #28\n\t" + /* Sub product of top 4 words and order */ + "mov %[s], sp\n\t" /* * -5cf5d3ed */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xa3\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #10\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #44\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r1, #0x2c\n\t" "lsl r1, r1, #8\n\t" "add r1, r1, #0x13\n\t" @@ -4724,16 +6167,35 @@ void sc_reduce(byte* s_p) "mov r1, #0x2c13\n\t" #endif "movt r1, #0xa30a\n\t" +#endif "mov lr, #0\n\t" - "ldm sp, {r6, r7, r8, r9}\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" "umlal r6, lr, r2, r1\n\t" - "umaal r7, lr, r3, r1\n\t" - "umaal r8, lr, r4, r1\n\t" - "umaal r9, lr, r5, r1\n\t" - "stm sp, {r6, r7, r8, r9}\n\t" - "add sp, sp, #4\n\t" + "adds r7, r7, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r7, lr, r3, r1\n\t" + "adds r8, r8, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r8, lr, r4, r1\n\t" + "adds r9, r9, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r9, lr, r5, r1\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" /* * -5812631b */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xa7\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0xed\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0x9c\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r1, #0x9c\n\t" "lsl r1, r1, #8\n\t" 
"add r1, r1, #0xe5\n\t" @@ -4741,16 +6203,35 @@ void sc_reduce(byte* s_p) "mov r1, #0x9ce5\n\t" #endif "movt r1, #0xa7ed\n\t" +#endif "mov r10, #0\n\t" - "ldm sp, {r6, r7, r8, r9}\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" "umlal r6, r10, r2, r1\n\t" - "umaal r7, r10, r3, r1\n\t" - "umaal r8, r10, r4, r1\n\t" - "umaal r9, r10, r5, r1\n\t" - "stm sp, {r6, r7, r8, r9}\n\t" - "add sp, sp, #4\n\t" + "adds r7, r7, r10\n\t" + "mov r10, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r7, r10, r3, r1\n\t" + "adds r8, r8, r10\n\t" + "mov r10, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r8, r10, r4, r1\n\t" + "adds r9, r9, r10\n\t" + "mov r10, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r9, r10, r5, r1\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" /* * -a2f79cd7 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x5d\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #8\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0x63\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r1, #0x63\n\t" "lsl r1, r1, #8\n\t" "add r1, r1, #0x29\n\t" @@ -4758,16 +6239,35 @@ void sc_reduce(byte* s_p) "mov r1, #0x6329\n\t" #endif "movt r1, #0x5d08\n\t" +#endif "mov r11, #0\n\t" - "ldm sp, {r6, r7, r8, r9}\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" "umlal r6, r11, r2, r1\n\t" - "umaal r7, r11, r3, r1\n\t" - "umaal r8, r11, r4, r1\n\t" - "umaal r9, r11, r5, r1\n\t" - "stm sp, {r6, r7, r8, r9}\n\t" - "add sp, sp, #4\n\t" + "adds r7, r7, r11\n\t" + "mov r11, #0\n\t" + "adc r11, r11, #0\n\t" + "umlal r7, r11, r3, r1\n\t" + "adds r8, r8, r11\n\t" + "mov r11, #0\n\t" + "adc r11, r11, #0\n\t" + "umlal r8, r11, r4, r1\n\t" + "adds r9, r9, r11\n\t" + "mov r11, #0\n\t" + "adc r11, r11, #0\n\t" + "umlal r9, r11, r5, r1\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" /* * -14def9df */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xeb\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #33\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #6\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r1, #0x6\n\t" "lsl r1, r1, #8\n\t" "add r1, r1, #0x21\n\t" @@ -4775,17 +6275,31 @@ void sc_reduce(byte* s_p) "mov r1, #0x621\n\t" #endif "movt r1, #0xeb21\n\t" +#endif "mov r12, #0\n\t" - "ldm sp, {r6, r7, r8, r9}\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" "umlal r6, r12, r2, r1\n\t" - "umaal r7, r12, r3, r1\n\t" - "umaal r8, r12, r4, r1\n\t" - "umaal r9, r12, r5, r1\n\t" - "stm sp, {r6, r7, r8, r9}\n\t" - "add sp, sp, #4\n\t" + "adds r7, r7, r12\n\t" + "mov r12, #0\n\t" + "adc r12, r12, #0\n\t" + "umlal r7, r12, r3, r1\n\t" + "adds r8, r8, r12\n\t" + "mov r12, #0\n\t" + "adc r12, r12, #0\n\t" + "umlal r8, r12, r4, r1\n\t" + "adds r9, r9, r12\n\t" + "mov r12, #0\n\t" + "adc r12, r12, #0\n\t" + "umlal r9, r12, r5, r1\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" /* Add overflows at 4 * 32 */ - "ldm sp, {r6, r7, r8, r9}\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else "bfc r9, #28, #4\n\t" +#endif "adds r6, r6, lr\n\t" "adcs r7, r7, r10\n\t" "adcs r8, r8, r11\n\t" @@ -4796,9 +6310,18 @@ void sc_reduce(byte* s_p) "sbcs r8, r8, r4\n\t" "sbcs r9, r9, r5\n\t" "sbc r1, r1, r1\n\t" - "sub sp, sp, #16\n\t" - "ldm sp, {r2, r3, r4, r5}\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "sub %[s], %[s], #16\n\t" + "ldm %[s], {r2, r3, r4, r5}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x5c\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xf5\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xd3\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xed\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r10, #0xd3\n\t" "lsl r10, r10, #8\n\t" "add r10, r10, #0xed\n\t" @@ -4806,7 +6329,17 @@ void sc_reduce(byte* s_p) "mov r10, #0xd3ed\n\t" #endif "movt r10, #0x5cf5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0x58\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #18\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0x63\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #26\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r11, #0x63\n\t" "lsl r11, r11, #8\n\t" "add r11, r11, #0x1a\n\t" @@ -4814,7 +6347,17 @@ void sc_reduce(byte* s_p) "mov r11, #0x631a\n\t" #endif "movt r11, #0x5812\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r12, #0xa2\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0xf7\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0x9c\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0xd6\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r12, #0x9c\n\t" "lsl r12, r12, #8\n\t" "add r12, r12, #0xd6\n\t" @@ -4822,7 +6365,17 @@ void sc_reduce(byte* s_p) "mov r12, #0x9cd6\n\t" #endif "movt r12, #0xa2f7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov lr, #20\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xde\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xf9\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xde\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov lr, #0xf9\n\t" "lsl lr, lr, #8\n\t" "add lr, lr, #0xde\n\t" @@ -4830,6 +6383,7 @@ void sc_reduce(byte* s_p) "mov lr, #0xf9de\n\t" #endif "movt lr, #0x14de\n\t" +#endif "and r10, r10, r1\n\t" "and r11, r11, r1\n\t" "and r12, r12, r1\n\t" @@ -4843,17 +6397,675 @@ void sc_reduce(byte* s_p) "and r1, r1, #0x10000000\n\t" "adcs r8, r8, #0\n\t" "adc r9, r9, r1\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else "bfc r9, #28, #4\n\t" +#endif /* Store result */ + "ldr %[s], [sp, #52]\n\t" "stm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" - "add sp, sp, #52\n\t" + "add sp, sp, #56\n\t" : [s] "+r" (s) : : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); } +#else +void sc_reduce(byte* s_p) +{ + register byte* s asm ("r0") = (byte*)s_p; + + __asm__ __volatile__ ( + "sub sp, sp, #56\n\t" + "str %[s], [sp, #52]\n\t" + /* Load bits 252-511 */ + "add %[s], %[s], #28\n\t" + "ldm %[s], {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "lsr lr, r9, #24\n\t" + "lsl r9, r9, #4\n\t" + "orr r9, r9, r8, LSR #28\n\t" + "lsl r8, r8, #4\n\t" + "orr r8, r8, r7, LSR #28\n\t" + "lsl r7, r7, #4\n\t" + "orr r7, r7, r6, LSR #28\n\t" + "lsl r6, r6, #4\n\t" + "orr r6, r6, r5, LSR #28\n\t" + "lsl r5, r5, #4\n\t" + "orr r5, r5, r4, LSR #28\n\t" + "lsl r4, r4, #4\n\t" + "orr r4, r4, r3, LSR #28\n\t" + "lsl r3, r3, #4\n\t" + "orr r3, r3, r2, LSR #28\n\t" + "lsl r2, r2, #4\n\t" + "orr r2, r2, r1, LSR #28\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else + "bfc r9, #28, #4\n\t" +#endif + "sub %[s], %[s], #28\n\t" + /* Add order times bits 504..511 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0xa3\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #10\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #44\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x2c\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0x13\n\t" +#else + "mov r10, #0x2c13\n\t" +#endif + "movt r10, #0xa30a\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0xa7\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0xed\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0x9c\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0x9c\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0xe5\n\t" +#else + "mov r11, #0x9ce5\n\t" +#endif + "movt r11, #0xa7ed\n\t" +#endif + "mov r1, #0\n\t" + "umlal r2, r1, r10, lr\n\t" + "umaal r3, r1, r11, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x5d\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #8\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0x63\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x63\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0x29\n\t" +#else + "mov r10, #0x6329\n\t" +#endif + "movt r10, #0x5d08\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0xeb\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #33\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #6\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0x6\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0x21\n\t" +#else + "mov r11, #0x621\n\t" +#endif + "movt r11, #0xeb21\n\t" +#endif + "umaal r4, r1, r10, lr\n\t" + "umaal r5, r1, r11, lr\n\t" + "adds r6, r6, r1\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, #0\n\t" + "subs r6, r6, lr\n\t" + "sbcs r7, r7, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbc r9, r9, #0\n\t" + /* Sub product of top 8 words and order */ + "mov r12, sp\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xa3\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #10\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #44\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x2c\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x13\n\t" +#else + "mov r1, #0x2c13\n\t" +#endif + "movt r1, #0xa30a\n\t" +#endif + "mov lr, #0\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "umlal r10, lr, r2, r1\n\t" + "umaal r11, lr, r3, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "umaal r10, lr, r4, r1\n\t" + "umaal r11, lr, r5, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "umaal r10, lr, r6, r1\n\t" + "umaal r11, lr, r7, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "umaal r10, lr, r8, r1\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r11, r11, #0xf0000000\n\t" +#else + "bfc r11, #28, #4\n\t" +#endif + "umaal r11, lr, r9, r1\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub %[s], %[s], #16\n\t" + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, 
#0xa7\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0xed\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0x9c\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x9c\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0xe5\n\t" +#else + "mov r1, #0x9ce5\n\t" +#endif + "movt r1, #0xa7ed\n\t" +#endif + "mov lr, #0\n\t" + "ldm r12, {r10, r11}\n\t" + "umlal r10, lr, r2, r1\n\t" + "umaal r11, lr, r3, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r4, r1\n\t" + "umaal r11, lr, r5, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r6, r1\n\t" + "umaal r11, lr, r7, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r8, r1\n\t" + "umaal r11, lr, r9, r1\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x5d\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #8\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0x63\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x63\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x29\n\t" +#else + "mov r1, #0x6329\n\t" +#endif + "movt r1, #0x5d08\n\t" +#endif + "mov lr, #0\n\t" + "ldm r12, {r10, r11}\n\t" + "umlal r10, lr, r2, r1\n\t" + "umaal r11, lr, r3, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r4, r1\n\t" + "umaal r11, lr, r5, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r6, r1\n\t" + "umaal r11, lr, r7, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r8, r1\n\t" + "umaal r11, lr, r9, r1\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xeb\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #33\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #6\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x6\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x21\n\t" +#else + "mov r1, #0x621\n\t" +#endif + "movt r1, #0xeb21\n\t" +#endif + "mov lr, #0\n\t" + "ldm r12, {r10, r11}\n\t" + "umlal r10, lr, r2, r1\n\t" + "umaal r11, lr, r3, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r4, r1\n\t" + "umaal r11, lr, r5, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r6, r1\n\t" + "umaal r11, lr, r7, r1\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r8, r1\n\t" + "umaal r11, lr, r9, r1\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" + /* Subtract at 4 * 32 */ + "ldm r12, {r10, r11}\n\t" + "subs r10, r10, r2\n\t" + "sbcs r11, r11, r3\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "sbcs r10, r10, r4\n\t" + "sbcs r11, r11, r5\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "sbcs r10, r10, r6\n\t" + "sbcs r11, r11, r7\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "sbcs r10, r10, r8\n\t" + "sbc r11, r11, r9\n\t" + "stm r12!, {r10, r11}\n\t" + "sub r12, r12, #36\n\t" + "asr lr, r11, #25\n\t" + /* Conditionally subtract order starting at bit 125 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xa00000\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x0\n\t" +#else + "mov r1, #0xa0000000\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) + "mov r2, #0x4b\n\t" + "lsl r2, r2, #8\n\t" + "orr r2, r2, #0x9e\n\t" + "lsl r2, r2, #8\n\t" + "orr r2, r2, #0xba\n\t" + "lsl r2, r2, #8\n\t" + "orr r2, r2, #0x7d\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r2, #0xba\n\t" + "lsl r2, r2, #8\n\t" + "add r2, r2, #0x7d\n\t" +#else + "mov r2, #0xba7d\n\t" +#endif + "movt r2, #0x4b9e\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r3, #0xcb\n\t" + "lsl r3, r3, #8\n\t" + "orr r3, r3, #2\n\t" + "lsl r3, r3, #8\n\t" + "orr r3, r3, #0x4c\n\t" + "lsl r3, r3, #8\n\t" + "orr r3, r3, #0x63\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r3, #0x4c\n\t" + "lsl r3, r3, #8\n\t" + "add r3, r3, #0x63\n\t" +#else + "mov r3, #0x4c63\n\t" +#endif + "movt r3, #0xcb02\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r4, #0xd4\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0x5e\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0xf3\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0x9a\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r4, #0xf3\n\t" + "lsl r4, r4, #8\n\t" + "add r4, r4, #0x9a\n\t" +#else + "mov r4, #0xf39a\n\t" +#endif + "movt r4, #0xd45e\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r5, #2\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #0x9b\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #0xdf\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #59\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r5, #0xdf\n\t" + "lsl r5, r5, #8\n\t" + "add r5, r5, #0x3b\n\t" +#else + "mov r5, #0xdf3b\n\t" +#endif + "movt r5, #0x29b\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r9, #0x20000\n\t" + "lsl r9, r9, #8\n\t" + "add r9, r9, #0x0\n\t" +#else + "mov r9, #0x2000000\n\t" +#endif + "and r1, r1, lr\n\t" + "and r2, r2, lr\n\t" + "and r3, r3, lr\n\t" + "and r4, r4, lr\n\t" + "and r5, r5, lr\n\t" + "and r9, r9, lr\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, r1\n\t" + "adcs r11, r11, r2\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adcs r10, r10, r3\n\t" + "adcs r11, r11, r4\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adcs r10, r10, r5\n\t" + "adcs r11, r11, #0\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10}\n\t" + "adcs r10, r10, #0\n\t" + "stm r12!, {r10}\n\t" + "sub %[s], %[s], #16\n\t" + "mov r12, sp\n\t" + /* Load bits 252-376 */ + "add r12, r12, #28\n\t" + "ldm r12, {r1, r2, r3, r4, r5}\n\t" + "lsl r5, r5, #4\n\t" + "orr r5, r5, r4, lsr #28\n\t" + "lsl r4, r4, #4\n\t" + "orr r4, r4, r3, lsr #28\n\t" + "lsl r3, r3, #4\n\t" + "orr r3, r3, r2, lsr #28\n\t" + "lsl r2, r2, #4\n\t" + "orr r2, r2, r1, lsr #28\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r5, r5, #0xe0000000\n\t" +#else + "bfc r5, #29, #3\n\t" +#endif + "sub r12, r12, #28\n\t" + /* Sub product of top 4 words and order */ + "mov %[s], sp\n\t" + /* * -5cf5d3ed */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xa3\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #10\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #44\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x2c\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x13\n\t" +#else + "mov r1, #0x2c13\n\t" +#endif + "movt r1, #0xa30a\n\t" +#endif + "mov lr, #0\n\t" + 
"ldm %[s], {r6, r7, r8, r9}\n\t" + "umlal r6, lr, r2, r1\n\t" + "umaal r7, lr, r3, r1\n\t" + "umaal r8, lr, r4, r1\n\t" + "umaal r9, lr, r5, r1\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" + /* * -5812631b */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xa7\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0xed\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0x9c\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x9c\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0xe5\n\t" +#else + "mov r1, #0x9ce5\n\t" +#endif + "movt r1, #0xa7ed\n\t" +#endif + "mov r10, #0\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" + "umlal r6, r10, r2, r1\n\t" + "umaal r7, r10, r3, r1\n\t" + "umaal r8, r10, r4, r1\n\t" + "umaal r9, r10, r5, r1\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" + /* * -a2f79cd7 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x5d\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #8\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #0x63\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x63\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x29\n\t" +#else + "mov r1, #0x6329\n\t" +#endif + "movt r1, #0x5d08\n\t" +#endif + "mov r11, #0\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" + "umlal r6, r11, r2, r1\n\t" + "umaal r7, r11, r3, r1\n\t" + "umaal r8, r11, r4, r1\n\t" + "umaal r9, r11, r5, r1\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" + /* * -14def9df */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0xeb\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #33\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #6\n\t" + "lsl r1, r1, #8\n\t" + "orr r1, r1, #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r1, #0x6\n\t" + "lsl r1, r1, #8\n\t" + "add r1, r1, #0x21\n\t" +#else + "mov r1, #0x621\n\t" +#endif + "movt r1, #0xeb21\n\t" +#endif + "mov r12, #0\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" + "umlal r6, r12, r2, r1\n\t" + "umaal r7, r12, r3, r1\n\t" + "umaal r8, r12, r4, r1\n\t" + "umaal r9, r12, r5, r1\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" + /* Add overflows at 4 * 32 */ + "ldm %[s], {r6, r7, r8, r9}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else + "bfc r9, #28, #4\n\t" +#endif + "adds r6, r6, lr\n\t" + "adcs r7, r7, r10\n\t" + "adcs r8, r8, r11\n\t" + "adc r9, r9, r12\n\t" + /* Subtract top at 4 * 32 */ + "subs r6, r6, r2\n\t" + "sbcs r7, r7, r3\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r5\n\t" + "sbc r1, r1, r1\n\t" + "sub %[s], %[s], #16\n\t" + "ldm %[s], {r2, r3, r4, r5}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x5c\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xf5\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xd3\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xed\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0xd3\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0xed\n\t" +#else + "mov r10, #0xd3ed\n\t" +#endif + "movt r10, #0x5cf5\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0x58\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #18\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0x63\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #26\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0x63\n\t" + "lsl 
r11, r11, #8\n\t" + "add r11, r11, #0x1a\n\t" +#else + "mov r11, #0x631a\n\t" +#endif + "movt r11, #0x5812\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r12, #0xa2\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0xf7\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0x9c\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0xd6\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r12, #0x9c\n\t" + "lsl r12, r12, #8\n\t" + "add r12, r12, #0xd6\n\t" +#else + "mov r12, #0x9cd6\n\t" +#endif + "movt r12, #0xa2f7\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov lr, #20\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xde\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xf9\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xde\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov lr, #0xf9\n\t" + "lsl lr, lr, #8\n\t" + "add lr, lr, #0xde\n\t" +#else + "mov lr, #0xf9de\n\t" +#endif + "movt lr, #0x14de\n\t" +#endif + "and r10, r10, r1\n\t" + "and r11, r11, r1\n\t" + "and r12, r12, r1\n\t" + "and lr, lr, r1\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adcs r5, r5, lr\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "and r1, r1, #0x10000000\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, r1\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else + "bfc r9, #28, #4\n\t" +#endif + /* Store result */ + "ldr %[s], [sp, #52]\n\t" + "stm %[s], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "add sp, sp, #56\n\t" + : [s] "+r" (s) + : + : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ #ifdef HAVE_ED25519_SIGN +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) { register byte* s asm ("r0") = (byte*)s_p; @@ -4862,7 +7074,1147 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) register const byte* c asm ("r3") = (const byte*)c_p; __asm__ __volatile__ ( - "sub sp, sp, #0x70\n\t" + "sub sp, sp, #0x50\n\t" + "add lr, sp, #0x44\n\t" + "stm lr, {%[s], %[a], %[c]}\n\t" + "mov %[s], #0\n\t" + "ldr r12, [%[a]]\n\t" + /* A[0] * B[0] */ + "ldr lr, [%[b]]\n\t" + "umull %[c], r4, r12, lr\n\t" + /* A[0] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "umull r5, r6, r12, lr\n\t" + /* A[0] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "umull r7, r8, r12, lr\n\t" + /* A[0] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "umull r9, r10, r12, lr\n\t" + "str %[c], [sp]\n\t" + /* A[0] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "mov r11, %[s]\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[0] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adcs r6, r6, #0\n\t" + "adc r11, %[s], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[0] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adcs r8, r8, #0\n\t" + "adc r11, %[s], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[0] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adcs r10, r10, #0\n\t" + "adc %[c], %[s], #0\n\t" + "umlal r10, %[c], r12, lr\n\t" + /* A[1] * B[0] */ + "ldr r12, [%[a], #4]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "str r4, [sp, #4]\n\t" + "adds r5, r5, r11\n\t" + /* A[1] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[1] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc 
r11, %[s], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[1] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[1] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[1] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[1] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds %[c], %[c], r11\n\t" + /* A[1] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r4, %[s], #0\n\t" + "umlal %[c], r4, r12, lr\n\t" + /* A[2] * B[0] */ + "ldr r12, [%[a], #8]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "str r5, [sp, #8]\n\t" + "adds r6, r6, r11\n\t" + /* A[2] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[2] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[2] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[2] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[2] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds %[c], %[c], r11\n\t" + /* A[2] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[s], #0\n\t" + "umlal %[c], r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[2] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r5, %[s], #0\n\t" + "umlal r4, r5, r12, lr\n\t" + /* A[3] * B[0] */ + "ldr r12, [%[a], #12]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "str r6, [sp, #12]\n\t" + "adds r7, r7, r11\n\t" + /* A[3] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[3] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[3] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[3] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds %[c], %[c], r11\n\t" + /* A[3] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[s], #0\n\t" + "umlal %[c], r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[3] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[3] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r6, %[s], #0\n\t" + "umlal r5, r6, r12, lr\n\t" + /* A[4] * B[0] */ + "ldr r12, [%[a], #16]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "str r7, [sp, #16]\n\t" + "adds r8, r8, r11\n\t" + /* A[4] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[4] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[4] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds %[c], %[c], r11\n\t" + /* A[4] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[s], #0\n\t" + "umlal %[c], r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[4] * B[5] */ 
+ "ldr lr, [%[b], #20]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[4] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[4] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r7, %[s], #0\n\t" + "umlal r6, r7, r12, lr\n\t" + /* A[5] * B[0] */ + "ldr r12, [%[a], #20]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "str r8, [sp, #20]\n\t" + "adds r9, r9, r11\n\t" + /* A[5] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[5] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds %[c], %[c], r11\n\t" + /* A[5] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[s], #0\n\t" + "umlal %[c], r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[5] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[5] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[5] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[5] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r8, %[s], #0\n\t" + "umlal r7, r8, r12, lr\n\t" + /* A[6] * B[0] */ + "ldr r12, [%[a], #24]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "str r9, [sp, #24]\n\t" + "adds r10, r10, r11\n\t" + /* A[6] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds %[c], %[c], r11\n\t" + /* A[6] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[s], #0\n\t" + "umlal %[c], r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[6] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[6] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[6] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[6] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[6] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r9, %[s], #0\n\t" + "umlal r8, r9, r12, lr\n\t" + /* A[7] * B[0] */ + "ldr r12, [%[a], #28]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "str r10, [sp, #28]\n\t" + "adds %[c], %[c], r11\n\t" + /* A[7] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[s], #0\n\t" + "umlal %[c], r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[7] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[7] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[7] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[7] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[7] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[s], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[7] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r10, %[s], #0\n\t" + "umlal r9, r10, r12, lr\n\t" + "add lr, sp, 
#32\n\t" + "stm lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t" + "mov %[s], sp\n\t" + /* Add c to a * b */ + "ldr lr, [sp, #76]\n\t" + "ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "ldm lr!, {%[a], r10, r11, r12}\n\t" + "adds %[b], %[b], %[a]\n\t" + "adcs %[c], %[c], r10\n\t" + "adcs r4, r4, r11\n\t" + "adcs r5, r5, r12\n\t" + "ldm lr!, {%[a], r10, r11, r12}\n\t" + "adcs r6, r6, %[a]\n\t" + "adcs r7, r7, r10\n\t" + "adcs r8, r8, r11\n\t" + "adcs r9, r9, r12\n\t" + "mov %[a], r9\n\t" + "stm %[s]!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "adcs %[b], %[b], #0\n\t" + "adcs %[c], %[c], #0\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, #0\n\t" + "sub %[s], %[s], #32\n\t" + /* Get 252..503 and 504..507 */ + "lsr lr, r9, #24\n\t" + "lsl r9, r9, #4\n\t" + "orr r9, r9, r8, LSR #28\n\t" + "lsl r8, r8, #4\n\t" + "orr r8, r8, r7, LSR #28\n\t" + "lsl r7, r7, #4\n\t" + "orr r7, r7, r6, LSR #28\n\t" + "lsl r6, r6, #4\n\t" + "orr r6, r6, r5, LSR #28\n\t" + "lsl r5, r5, #4\n\t" + "orr r5, r5, r4, LSR #28\n\t" + "lsl r4, r4, #4\n\t" + "orr r4, r4, %[c], LSR #28\n\t" + "lsl %[c], %[c], #4\n\t" + "orr %[c], %[c], %[b], LSR #28\n\t" + "lsl %[b], %[b], #4\n\t" + "orr %[b], %[b], %[a], LSR #28\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else + "bfc r9, #28, #4\n\t" +#endif + /* Add order times bits 504..507 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0xa3\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #10\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #44\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x2c\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0x13\n\t" +#else + "mov r10, #0x2c13\n\t" +#endif + "movt r10, #0xa30a\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0xa7\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0xed\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0x9c\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0x9c\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0xe5\n\t" +#else + "mov r11, #0x9ce5\n\t" +#endif + "movt r11, #0xa7ed\n\t" +#endif + "mov %[a], #0\n\t" + "umlal %[b], %[a], r10, lr\n\t" + "adds %[c], %[c], %[a]\n\t" + "mov %[a], #0\n\t" + "adc %[a], %[a], #0\n\t" + "umlal %[c], %[a], r11, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x5d\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #8\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0x63\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x63\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0x29\n\t" +#else + "mov r10, #0x6329\n\t" +#endif + "movt r10, #0x5d08\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0xeb\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #33\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #6\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0x6\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0x21\n\t" +#else + "mov r11, #0x621\n\t" +#endif + "movt r11, #0xeb21\n\t" +#endif + "adds r4, r4, %[a]\n\t" + "mov %[a], #0\n\t" + "adc %[a], %[a], #0\n\t" + 
"umlal r4, %[a], r10, lr\n\t" + "adds r5, r5, %[a]\n\t" + "mov %[a], #0\n\t" + "adc %[a], %[a], #0\n\t" + "umlal r5, %[a], r11, lr\n\t" + "adds r6, r6, %[a]\n\t" + "adcs r7, r7, #0\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, #0\n\t" + "subs r6, r6, lr\n\t" + "sbcs r7, r7, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbc r9, r9, #0\n\t" + /* Sub product of top 8 words and order */ + "mov r12, sp\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xa3\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #10\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #44\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x2c\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x13\n\t" +#else + "mov %[a], #0x2c13\n\t" +#endif + "movt %[a], #0xa30a\n\t" +#endif + "mov lr, #0\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "umlal r10, lr, %[b], %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, %[c], %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r4, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r5, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r6, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r7, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r8, %[a]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r11, r11, #0xf0000000\n\t" +#else + "bfc r11, #28, #4\n\t" +#endif + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r9, %[a]\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub %[s], %[s], #16\n\t" + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xa7\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0xed\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0x9c\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x9c\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0xe5\n\t" +#else + "mov %[a], #0x9ce5\n\t" +#endif + "movt %[a], #0xa7ed\n\t" +#endif + "mov lr, #0\n\t" + "ldm r12, {r10, r11}\n\t" + "umlal r10, lr, %[b], %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, %[c], %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r4, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r5, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r6, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r7, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r8, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r9, %[a]\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x5d\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #8\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0x63\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x63\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x29\n\t" +#else + "mov %[a], #0x6329\n\t" +#endif + "movt %[a], #0x5d08\n\t" +#endif + "mov lr, #0\n\t" + "ldm r12, {r10, r11}\n\t" + "umlal r10, lr, %[b], %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, %[c], %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r4, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r5, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r6, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r7, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r8, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r9, %[a]\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xeb\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #33\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #6\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x6\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x21\n\t" +#else + "mov %[a], #0x621\n\t" +#endif + "movt %[a], #0xeb21\n\t" +#endif + "mov lr, #0\n\t" + "ldm r12, {r10, r11}\n\t" + "umlal r10, lr, %[b], %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, %[c], %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r4, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r5, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r6, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r7, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r10, lr, r8, %[a]\n\t" + "adds r11, r11, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r11, lr, r9, %[a]\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" + /* Subtract at 4 * 32 */ + "ldm r12, {r10, r11}\n\t" + "subs r10, r10, %[b]\n\t" + "sbcs r11, r11, %[c]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "sbcs r10, r10, r4\n\t" + "sbcs r11, r11, r5\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "sbcs r10, r10, r6\n\t" + "sbcs r11, r11, r7\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "sbcs r10, r10, r8\n\t" + "sbc r11, r11, r9\n\t" + "stm r12!, {r10, r11}\n\t" + "sub r12, r12, #36\n\t" + "asr lr, r11, #25\n\t" + /* Conditionally subtract order starting at bit 125 */ +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xa00000\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x0\n\t" +#else + "mov %[a], #0xa0000000\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[b], #0x4b\n\t" + "lsl %[b], %[b], #8\n\t" + "orr %[b], %[b], #0x9e\n\t" + "lsl %[b], %[b], #8\n\t" + "orr %[b], %[b], #0xba\n\t" + "lsl %[b], %[b], #8\n\t" + "orr %[b], %[b], #0x7d\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[b], #0xba\n\t" + "lsl %[b], %[b], #8\n\t" + "add %[b], %[b], #0x7d\n\t" +#else + "mov %[b], #0xba7d\n\t" +#endif + "movt %[b], #0x4b9e\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[c], #0xcb\n\t" + "lsl %[c], %[c], #8\n\t" + "orr %[c], %[c], #2\n\t" + "lsl %[c], %[c], #8\n\t" + "orr %[c], %[c], #0x4c\n\t" + "lsl %[c], %[c], #8\n\t" + "orr %[c], %[c], #0x63\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[c], #0x4c\n\t" + "lsl %[c], %[c], #8\n\t" + "add %[c], %[c], #0x63\n\t" +#else + "mov %[c], #0x4c63\n\t" +#endif + "movt %[c], #0xcb02\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r4, #0xd4\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0x5e\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0xf3\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0x9a\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r4, #0xf3\n\t" + "lsl r4, r4, #8\n\t" + "add r4, r4, #0x9a\n\t" +#else + "mov r4, #0xf39a\n\t" +#endif + "movt r4, #0xd45e\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r5, #2\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #0x9b\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #0xdf\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #59\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r5, #0xdf\n\t" + "lsl r5, r5, #8\n\t" + "add r5, r5, #0x3b\n\t" +#else + "mov r5, #0xdf3b\n\t" +#endif + "movt r5, #0x29b\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r9, #0x20000\n\t" + "lsl r9, r9, #8\n\t" + "add r9, r9, #0x0\n\t" +#else + "mov r9, #0x2000000\n\t" +#endif + "and %[a], %[a], lr\n\t" + "and %[b], %[b], lr\n\t" + "and %[c], %[c], lr\n\t" + "and r4, r4, lr\n\t" + "and r5, r5, lr\n\t" + "and r9, r9, lr\n\t" + "ldm r12, {r10, r11}\n\t" + "adds r10, r10, %[a]\n\t" + "adcs r11, r11, %[b]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adcs r10, r10, %[c]\n\t" + "adcs r11, r11, r4\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adcs r10, r10, r5\n\t" + "adcs r11, r11, #0\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, #0\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10}\n\t" + "adcs r10, r10, #0\n\t" + "stm r12!, {r10}\n\t" + "sub %[s], %[s], #16\n\t" + "mov r12, sp\n\t" + /* Load bits 252-376 */ + "add r12, r12, #28\n\t" + "ldm r12, {%[a], %[b], %[c], r4, r5}\n\t" + "lsl r5, r5, #4\n\t" + "orr r5, r5, r4, lsr #28\n\t" + "lsl r4, r4, #4\n\t" + "orr r4, r4, %[c], lsr #28\n\t" + "lsl %[c], %[c], #4\n\t" + "orr %[c], %[c], %[b], lsr #28\n\t" + "lsl %[b], %[b], #4\n\t" + "orr %[b], %[b], %[a], lsr #28\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r5, r5, #0xe0000000\n\t" +#else + "bfc r5, #29, #3\n\t" +#endif + "sub r12, r12, #28\n\t" + /* Sub product of top 4 words and order */ + "mov %[s], sp\n\t" + /* * -5cf5d3ed */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xa3\n\t" + "lsl %[a], %[a], #8\n\t" 
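+ /* MOVW/MOVT are not used on this pre-ARMv7 path; the 32-bit constant is assembled a byte at a time with MOV/LSL/ORR instead. */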
+ "orr %[a], %[a], #10\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #44\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x2c\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x13\n\t" +#else + "mov %[a], #0x2c13\n\t" +#endif + "movt %[a], #0xa30a\n\t" +#endif + "mov lr, #0\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" + "umlal r6, lr, %[b], %[a]\n\t" + "adds r7, r7, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r7, lr, %[c], %[a]\n\t" + "adds r8, r8, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r8, lr, r4, %[a]\n\t" + "adds r9, r9, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "umlal r9, lr, r5, %[a]\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" + /* * -5812631b */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xa7\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0xed\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0x9c\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x9c\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0xe5\n\t" +#else + "mov %[a], #0x9ce5\n\t" +#endif + "movt %[a], #0xa7ed\n\t" +#endif + "mov r10, #0\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" + "umlal r6, r10, %[b], %[a]\n\t" + "adds r7, r7, r10\n\t" + "mov r10, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r7, r10, %[c], %[a]\n\t" + "adds r8, r8, r10\n\t" + "mov r10, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r8, r10, r4, %[a]\n\t" + "adds r9, r9, r10\n\t" + "mov r10, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r9, r10, r5, %[a]\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" + /* * -a2f79cd7 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x5d\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #8\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0x63\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x63\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x29\n\t" +#else + "mov %[a], #0x6329\n\t" +#endif + "movt %[a], #0x5d08\n\t" +#endif + "mov r11, #0\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" + "umlal r6, r11, %[b], %[a]\n\t" + "adds r7, r7, r11\n\t" + "mov r11, #0\n\t" + "adc r11, r11, #0\n\t" + "umlal r7, r11, %[c], %[a]\n\t" + "adds r8, r8, r11\n\t" + "mov r11, #0\n\t" + "adc r11, r11, #0\n\t" + "umlal r8, r11, r4, %[a]\n\t" + "adds r9, r9, r11\n\t" + "mov r11, #0\n\t" + "adc r11, r11, #0\n\t" + "umlal r9, r11, r5, %[a]\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" + /* * -14def9df */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xeb\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #33\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #6\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x6\n\t" + "lsl %[a], %[a], #8\n\t" + "add %[a], %[a], #0x21\n\t" +#else + "mov %[a], #0x621\n\t" +#endif + "movt %[a], #0xeb21\n\t" +#endif + "mov r12, #0\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" + "umlal r6, r12, %[b], %[a]\n\t" + "adds r7, r7, r12\n\t" + "mov r12, #0\n\t" + "adc r12, r12, #0\n\t" + "umlal r7, r12, %[c], %[a]\n\t" + "adds r8, r8, r12\n\t" + "mov r12, #0\n\t" + "adc r12, r12, #0\n\t" + "umlal r8, r12, r4, %[a]\n\t" + "adds r9, r9, r12\n\t" + "mov r12, #0\n\t" + "adc r12, r12, #0\n\t" + 
"umlal r9, r12, r5, %[a]\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" + /* Add overflows at 4 * 32 */ + "ldm %[s], {r6, r7, r8, r9}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else + "bfc r9, #28, #4\n\t" +#endif + "adds r6, r6, lr\n\t" + "adcs r7, r7, r10\n\t" + "adcs r8, r8, r11\n\t" + "adc r9, r9, r12\n\t" + /* Subtract top at 4 * 32 */ + "subs r6, r6, %[b]\n\t" + "sbcs r7, r7, %[c]\n\t" + "sbcs r8, r8, r4\n\t" + "sbcs r9, r9, r5\n\t" + "sbc %[a], %[a], %[a]\n\t" + "sub %[s], %[s], #16\n\t" + "ldm %[s], {%[b], %[c], r4, r5}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x5c\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xf5\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xd3\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xed\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0xd3\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0xed\n\t" +#else + "mov r10, #0xd3ed\n\t" +#endif + "movt r10, #0x5cf5\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0x58\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #18\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0x63\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #26\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0x63\n\t" + "lsl r11, r11, #8\n\t" + "add r11, r11, #0x1a\n\t" +#else + "mov r11, #0x631a\n\t" +#endif + "movt r11, #0x5812\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r12, #0xa2\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0xf7\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0x9c\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0xd6\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r12, #0x9c\n\t" + "lsl r12, r12, #8\n\t" + "add r12, r12, #0xd6\n\t" +#else + "mov r12, #0x9cd6\n\t" +#endif + "movt r12, #0xa2f7\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov lr, #20\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xde\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xf9\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xde\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov lr, #0xf9\n\t" + "lsl lr, lr, #8\n\t" + "add lr, lr, #0xde\n\t" +#else + "mov lr, #0xf9de\n\t" +#endif + "movt lr, #0x14de\n\t" +#endif + "and r10, r10, %[a]\n\t" + "and r11, r11, %[a]\n\t" + "and r12, r12, %[a]\n\t" + "and lr, lr, %[a]\n\t" + "adds %[b], %[b], r10\n\t" + "adcs %[c], %[c], r11\n\t" + "adcs r4, r4, r12\n\t" + "adcs r5, r5, lr\n\t" + "adcs r6, r6, #0\n\t" + "adcs r7, r7, #0\n\t" + "and %[a], %[a], #0x10000000\n\t" + "adcs r8, r8, #0\n\t" + "adc r9, r9, %[a]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else + "bfc r9, #28, #4\n\t" +#endif + "ldr %[s], [sp, #68]\n\t" + /* Store result */ + "str %[b], [%[s]]\n\t" + "str %[c], [%[s], #4]\n\t" + "str r4, [%[s], #8]\n\t" + "str r5, [%[s], #12]\n\t" + "str r6, [%[s], #16]\n\t" + "str r7, [%[s], #20]\n\t" + "str r8, [%[s], #24]\n\t" + "str r9, [%[s], #28]\n\t" + "add sp, sp, #0x50\n\t" + : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#else +void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) +{ + register byte* s asm ("r0") = (byte*)s_p; + register const byte* a asm ("r1") = (const byte*)a_p; + register const byte* b asm ("r2") = (const 
byte*)b_p; + register const byte* c asm ("r3") = (const byte*)c_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x50\n\t" "add lr, sp, #0x44\n\t" "stm lr, {%[s], %[a], %[c]}\n\t" "mov lr, %[b]\n\t" @@ -4964,10 +8316,10 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[c], r12\n\t" "add lr, sp, #32\n\t" "stm lr, {%[c], r4, r5, r6, r7, r8, r9, r10}\n\t" - "add %[s], sp, #0x50\n\t" + "mov %[s], sp\n\t" /* Add c to a * b */ "ldr lr, [sp, #76]\n\t" - "ldm sp!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" "ldm lr!, {%[a], r10, r11, r12}\n\t" "adds %[b], %[b], %[a]\n\t" "adcs %[c], %[c], r10\n\t" @@ -4979,8 +8331,8 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "adcs r8, r8, r11\n\t" "adcs r9, r9, r12\n\t" "mov %[a], r9\n\t" - "stm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" - "ldm sp, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "stm %[s]!, {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" + "ldm %[s], {%[b], %[c], r4, r5, r6, r7, r8, r9}\n\t" "adcs %[b], %[b], #0\n\t" "adcs %[c], %[c], #0\n\t" "adcs r4, r4, #0\n\t" @@ -4989,28 +8341,41 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "adcs r7, r7, #0\n\t" "adcs r8, r8, #0\n\t" "adc r9, r9, #0\n\t" - "sub sp, sp, #32\n\t" + "sub %[s], %[s], #32\n\t" /* Get 252..503 and 504..507 */ "lsr lr, r9, #24\n\t" - "bfc r9, #24, #8\n\t" "lsl r9, r9, #4\n\t" - "orr r9, r9, r8, lsr #28\n\t" + "orr r9, r9, r8, LSR #28\n\t" "lsl r8, r8, #4\n\t" - "orr r8, r8, r7, lsr #28\n\t" + "orr r8, r8, r7, LSR #28\n\t" "lsl r7, r7, #4\n\t" - "orr r7, r7, r6, lsr #28\n\t" + "orr r7, r7, r6, LSR #28\n\t" "lsl r6, r6, #4\n\t" - "orr r6, r6, r5, lsr #28\n\t" + "orr r6, r6, r5, LSR #28\n\t" "lsl r5, r5, #4\n\t" - "orr r5, r5, r4, lsr #28\n\t" + "orr r5, r5, r4, LSR #28\n\t" "lsl r4, r4, #4\n\t" - "orr r4, r4, %[c], lsr #28\n\t" + "orr r4, r4, %[c], LSR #28\n\t" "lsl %[c], %[c], #4\n\t" - "orr %[c], %[c], %[b], lsr #28\n\t" + "orr %[c], %[c], %[b], LSR #28\n\t" "lsl %[b], %[b], #4\n\t" - "orr %[b], %[b], %[a], lsr #28\n\t" + "orr %[b], %[b], %[a], LSR #28\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else + "bfc r9, #28, #4\n\t" +#endif /* Add order times bits 504..507 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0xa3\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #10\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #44\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r10, #0x2c\n\t" "lsl r10, r10, #8\n\t" "add r10, r10, #0x13\n\t" @@ -5018,7 +8383,17 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r10, #0x2c13\n\t" #endif "movt r10, #0xa30a\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0xa7\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0xed\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0x9c\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r11, #0x9c\n\t" "lsl r11, r11, #8\n\t" "add r11, r11, #0xe5\n\t" @@ -5026,10 +8401,20 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r11, #0x9ce5\n\t" #endif "movt r11, #0xa7ed\n\t" +#endif "mov %[a], #0\n\t" "umlal %[b], %[a], r10, 
lr\n\t" "umaal %[c], %[a], r11, lr\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x5d\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #8\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0x63\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r10, #0x63\n\t" "lsl r10, r10, #8\n\t" "add r10, r10, #0x29\n\t" @@ -5037,7 +8422,17 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r10, #0x6329\n\t" #endif "movt r10, #0x5d08\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0xeb\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #33\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #6\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r11, #0x6\n\t" "lsl r11, r11, #8\n\t" "add r11, r11, #0x21\n\t" @@ -5045,6 +8440,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r11, #0x621\n\t" #endif "movt r11, #0xeb21\n\t" +#endif "umaal r4, %[a], r10, lr\n\t" "umaal r5, %[a], r11, lr\n\t" "adds r6, r6, %[a]\n\t" @@ -5056,7 +8452,17 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "sbcs r8, r8, #0\n\t" "sbc r9, r9, #0\n\t" /* Sub product of top 8 words and order */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "mov r12, sp\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xa3\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #10\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #44\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[a], #0x2c\n\t" "lsl %[a], %[a], #8\n\t" "add %[a], %[a], #0x13\n\t" @@ -5064,25 +8470,41 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[a], #0x2c13\n\t" #endif "movt %[a], #0xa30a\n\t" +#endif "mov lr, #0\n\t" - "ldm %[s]!, {r10, r11, r12}\n\t" + "ldm %[s]!, {r10, r11}\n\t" "umlal r10, lr, %[b], %[a]\n\t" "umaal r11, lr, %[c], %[a]\n\t" - "umaal r12, lr, r4, %[a]\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm %[s]!, {r10, r11, r12}\n\t" - "umaal r10, lr, r5, %[a]\n\t" - "umaal r11, lr, r6, %[a]\n\t" - "umaal r12, lr, r7, %[a]\n\t" - "stm sp!, {r10, r11, r12}\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "umaal r10, lr, r4, %[a]\n\t" + "umaal r11, lr, r5, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm %[s]!, {r10, r11}\n\t" + "umaal r10, lr, r6, %[a]\n\t" + "umaal r11, lr, r7, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" "ldm %[s]!, {r10, r11}\n\t" "umaal r10, lr, r8, %[a]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r11, r11, #0xf0000000\n\t" +#else "bfc r11, #28, #4\n\t" +#endif "umaal r11, lr, r9, %[a]\n\t" - "stm sp!, {r10, r11, lr}\n\t" + "stm r12!, {r10, r11, lr}\n\t" "sub %[s], %[s], #16\n\t" - "sub sp, sp, #32\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xa7\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0xed\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0x9c\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[a], #0x9c\n\t" "lsl %[a], %[a], 
#8\n\t" "add %[a], %[a], #0xe5\n\t" @@ -5090,23 +8512,35 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[a], #0x9ce5\n\t" #endif "movt %[a], #0xa7ed\n\t" +#endif "mov lr, #0\n\t" - "ldm sp, {r10, r11, r12}\n\t" + "ldm r12, {r10, r11}\n\t" "umlal r10, lr, %[b], %[a]\n\t" "umaal r11, lr, %[c], %[a]\n\t" - "umaal r12, lr, r4, %[a]\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" - "umaal r10, lr, r5, %[a]\n\t" - "umaal r11, lr, r6, %[a]\n\t" - "umaal r12, lr, r7, %[a]\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11}\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r4, %[a]\n\t" + "umaal r11, lr, r5, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r6, %[a]\n\t" + "umaal r11, lr, r7, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" "umaal r10, lr, r8, %[a]\n\t" "umaal r11, lr, r9, %[a]\n\t" - "stm sp!, {r10, r11, lr}\n\t" - "sub sp, sp, #32\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x5d\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #8\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0x63\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[a], #0x63\n\t" "lsl %[a], %[a], #8\n\t" "add %[a], %[a], #0x29\n\t" @@ -5114,23 +8548,35 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[a], #0x6329\n\t" #endif "movt %[a], #0x5d08\n\t" +#endif "mov lr, #0\n\t" - "ldm sp, {r10, r11, r12}\n\t" + "ldm r12, {r10, r11}\n\t" "umlal r10, lr, %[b], %[a]\n\t" "umaal r11, lr, %[c], %[a]\n\t" - "umaal r12, lr, r4, %[a]\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" - "umaal r10, lr, r5, %[a]\n\t" - "umaal r11, lr, r6, %[a]\n\t" - "umaal r12, lr, r7, %[a]\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11}\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r4, %[a]\n\t" + "umaal r11, lr, r5, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r6, %[a]\n\t" + "umaal r11, lr, r7, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" "umaal r10, lr, r8, %[a]\n\t" "umaal r11, lr, r9, %[a]\n\t" - "stm sp!, {r10, r11, lr}\n\t" - "sub sp, sp, #32\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xeb\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #33\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #6\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[a], #0x6\n\t" "lsl %[a], %[a], #8\n\t" "add %[a], %[a], #0x21\n\t" @@ -5138,48 +8584,62 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[a], #0x621\n\t" #endif "movt %[a], #0xeb21\n\t" +#endif "mov lr, #0\n\t" - "ldm sp, {r10, r11, r12}\n\t" + "ldm r12, {r10, r11}\n\t" "umlal r10, lr, %[b], %[a]\n\t" "umaal r11, lr, %[c], %[a]\n\t" - "umaal r12, lr, r4, %[a]\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" - "umaal r10, lr, r5, %[a]\n\t" - "umaal r11, lr, r6, %[a]\n\t" - "umaal r12, lr, r7, %[a]\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11}\n\t" 
+ "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r4, %[a]\n\t" + "umaal r11, lr, r5, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "umaal r10, lr, r6, %[a]\n\t" + "umaal r11, lr, r7, %[a]\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" "umaal r10, lr, r8, %[a]\n\t" "umaal r11, lr, r9, %[a]\n\t" - "stm sp!, {r10, r11, lr}\n\t" - "sub sp, sp, #32\n\t" + "stm r12!, {r10, r11, lr}\n\t" + "sub r12, r12, #32\n\t" /* Subtract at 4 * 32 */ - "ldm sp, {r10, r11, r12}\n\t" + "ldm r12, {r10, r11}\n\t" "subs r10, r10, %[b]\n\t" "sbcs r11, r11, %[c]\n\t" - "sbcs r12, r12, r4\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" - "sbcs r10, r10, r5\n\t" - "sbcs r11, r11, r6\n\t" - "sbcs r12, r12, r7\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11}\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "sbcs r10, r10, r4\n\t" + "sbcs r11, r11, r5\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "sbcs r10, r10, r6\n\t" + "sbcs r11, r11, r7\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" "sbcs r10, r10, r8\n\t" "sbc r11, r11, r9\n\t" - "stm sp!, {r10, r11}\n\t" - "sub sp, sp, #36\n\t" + "stm r12!, {r10, r11}\n\t" + "sub r12, r12, #36\n\t" "asr lr, r11, #25\n\t" /* Conditionally subtract order starting at bit 125 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[a], #0xa00000\n\t" "lsl %[a], %[a], #8\n\t" "add %[a], %[a], #0x0\n\t" #else "mov %[a], #0xa0000000\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[b], #0x4b\n\t" + "lsl %[b], %[b], #8\n\t" + "orr %[b], %[b], #0x9e\n\t" + "lsl %[b], %[b], #8\n\t" + "orr %[b], %[b], #0xba\n\t" + "lsl %[b], %[b], #8\n\t" + "orr %[b], %[b], #0x7d\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[b], #0xba\n\t" "lsl %[b], %[b], #8\n\t" "add %[b], %[b], #0x7d\n\t" @@ -5187,7 +8647,17 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[b], #0xba7d\n\t" #endif "movt %[b], #0x4b9e\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[c], #0xcb\n\t" + "lsl %[c], %[c], #8\n\t" + "orr %[c], %[c], #2\n\t" + "lsl %[c], %[c], #8\n\t" + "orr %[c], %[c], #0x4c\n\t" + "lsl %[c], %[c], #8\n\t" + "orr %[c], %[c], #0x63\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[c], #0x4c\n\t" "lsl %[c], %[c], #8\n\t" "add %[c], %[c], #0x63\n\t" @@ -5195,7 +8665,17 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[c], #0x4c63\n\t" #endif "movt %[c], #0xcb02\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r4, #0xd4\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0x5e\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0xf3\n\t" + "lsl r4, r4, #8\n\t" + "orr r4, r4, #0x9a\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r4, #0xf3\n\t" "lsl r4, r4, #8\n\t" "add r4, r4, #0x9a\n\t" @@ -5203,7 +8683,17 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r4, #0xf39a\n\t" #endif "movt r4, #0xd45e\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r5, #2\n\t" + "lsl r5, r5, 
#8\n\t" + "orr r5, r5, #0x9b\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #0xdf\n\t" + "lsl r5, r5, #8\n\t" + "orr r5, r5, #59\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r5, #0xdf\n\t" "lsl r5, r5, #8\n\t" "add r5, r5, #0x3b\n\t" @@ -5211,7 +8701,8 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r5, #0xdf3b\n\t" #endif "movt r5, #0x29b\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r9, #0x20000\n\t" "lsl r9, r9, #8\n\t" "add r9, r9, #0x0\n\t" @@ -5224,26 +8715,30 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "and r4, r4, lr\n\t" "and r5, r5, lr\n\t" "and r9, r9, lr\n\t" - "ldm sp, {r10, r11, r12}\n\t" + "ldm r12, {r10, r11}\n\t" "adds r10, r10, %[a]\n\t" "adcs r11, r11, %[b]\n\t" - "adcs r12, r12, %[c]\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" - "adcs r10, r10, r4\n\t" - "adcs r11, r11, r5\n\t" - "adcs r12, r12, #0\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "ldm sp, {r10, r11, r12}\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adcs r10, r10, %[c]\n\t" + "adcs r11, r11, r4\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" + "adcs r10, r10, r5\n\t" + "adcs r11, r11, #0\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10, r11}\n\t" "adcs r10, r10, #0\n\t" "adcs r11, r11, #0\n\t" - "adcs r12, r12, r9\n\t" - "stm sp!, {r10, r11, r12}\n\t" - "sub sp, sp, #48\n\t" + "stm r12!, {r10, r11}\n\t" + "ldm r12, {r10}\n\t" + "adcs r10, r10, #0\n\t" + "stm r12!, {r10}\n\t" "sub %[s], %[s], #16\n\t" + "mov r12, sp\n\t" /* Load bits 252-376 */ - "add sp, sp, #28\n\t" - "ldm sp, {%[a], %[b], %[c], r4, r5}\n\t" + "add r12, r12, #28\n\t" + "ldm r12, {%[a], %[b], %[c], r4, r5}\n\t" "lsl r5, r5, #4\n\t" "orr r5, r5, r4, lsr #28\n\t" "lsl r4, r4, #4\n\t" @@ -5252,11 +8747,25 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "orr %[c], %[c], %[b], lsr #28\n\t" "lsl %[b], %[b], #4\n\t" "orr %[b], %[b], %[a], lsr #28\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r5, r5, #0xe0000000\n\t" +#else "bfc r5, #29, #3\n\t" - "sub sp, sp, #28\n\t" - /* Sub product of top 8 words and order */ +#endif + "sub r12, r12, #28\n\t" + /* Sub product of top 4 words and order */ + "mov %[s], sp\n\t" /* * -5cf5d3ed */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xa3\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #10\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #44\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #19\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[a], #0x2c\n\t" "lsl %[a], %[a], #8\n\t" "add %[a], %[a], #0x13\n\t" @@ -5264,16 +8773,26 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[a], #0x2c13\n\t" #endif "movt %[a], #0xa30a\n\t" +#endif "mov lr, #0\n\t" - "ldm sp, {r6, r7, r8, r9}\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" "umlal r6, lr, %[b], %[a]\n\t" "umaal r7, lr, %[c], %[a]\n\t" "umaal r8, lr, r4, %[a]\n\t" "umaal r9, lr, r5, %[a]\n\t" - "stm sp, {r6, r7, r8, r9}\n\t" - "add sp, sp, #4\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" /* * -5812631b */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xa7\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], 
#0xed\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0x9c\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0xe5\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[a], #0x9c\n\t" "lsl %[a], %[a], #8\n\t" "add %[a], %[a], #0xe5\n\t" @@ -5281,16 +8800,26 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[a], #0x9ce5\n\t" #endif "movt %[a], #0xa7ed\n\t" +#endif "mov r10, #0\n\t" - "ldm sp, {r6, r7, r8, r9}\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" "umlal r6, r10, %[b], %[a]\n\t" "umaal r7, r10, %[c], %[a]\n\t" "umaal r8, r10, r4, %[a]\n\t" "umaal r9, r10, r5, %[a]\n\t" - "stm sp, {r6, r7, r8, r9}\n\t" - "add sp, sp, #4\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" /* * -a2f79cd7 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0x5d\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #8\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #0x63\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #41\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[a], #0x63\n\t" "lsl %[a], %[a], #8\n\t" "add %[a], %[a], #0x29\n\t" @@ -5298,16 +8827,26 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[a], #0x6329\n\t" #endif "movt %[a], #0x5d08\n\t" +#endif "mov r11, #0\n\t" - "ldm sp, {r6, r7, r8, r9}\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" "umlal r6, r11, %[b], %[a]\n\t" "umaal r7, r11, %[c], %[a]\n\t" "umaal r8, r11, r4, %[a]\n\t" "umaal r9, r11, r5, %[a]\n\t" - "stm sp, {r6, r7, r8, r9}\n\t" - "add sp, sp, #4\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" /* * -14def9df */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov %[a], #0xeb\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #33\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #6\n\t" + "lsl %[a], %[a], #8\n\t" + "orr %[a], %[a], #33\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov %[a], #0x6\n\t" "lsl %[a], %[a], #8\n\t" "add %[a], %[a], #0x21\n\t" @@ -5315,17 +8854,22 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov %[a], #0x621\n\t" #endif "movt %[a], #0xeb21\n\t" +#endif "mov r12, #0\n\t" - "ldm sp, {r6, r7, r8, r9}\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" "umlal r6, r12, %[b], %[a]\n\t" "umaal r7, r12, %[c], %[a]\n\t" "umaal r8, r12, r4, %[a]\n\t" "umaal r9, r12, r5, %[a]\n\t" - "stm sp, {r6, r7, r8, r9}\n\t" - "add sp, sp, #4\n\t" + "stm %[s], {r6, r7, r8, r9}\n\t" + "add %[s], %[s], #4\n\t" /* Add overflows at 4 * 32 */ - "ldm sp, {r6, r7, r8, r9}\n\t" + "ldm %[s], {r6, r7, r8, r9}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else "bfc r9, #28, #4\n\t" +#endif "adds r6, r6, lr\n\t" "adcs r7, r7, r10\n\t" "adcs r8, r8, r11\n\t" @@ -5336,9 +8880,18 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "sbcs r8, r8, r4\n\t" "sbcs r9, r9, r5\n\t" "sbc %[a], %[a], %[a]\n\t" - "sub sp, sp, #16\n\t" - "ldm sp, {%[b], %[c], r4, r5}\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "sub %[s], %[s], #16\n\t" + "ldm %[s], {%[b], %[c], r4, r5}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x5c\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xf5\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, #0xd3\n\t" + "lsl r10, r10, #8\n\t" + "orr r10, r10, 
#0xed\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r10, #0xd3\n\t" "lsl r10, r10, #8\n\t" "add r10, r10, #0xed\n\t" @@ -5346,7 +8899,17 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r10, #0xd3ed\n\t" #endif "movt r10, #0x5cf5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r11, #0x58\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #18\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #0x63\n\t" + "lsl r11, r11, #8\n\t" + "orr r11, r11, #26\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r11, #0x63\n\t" "lsl r11, r11, #8\n\t" "add r11, r11, #0x1a\n\t" @@ -5354,7 +8917,17 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r11, #0x631a\n\t" #endif "movt r11, #0x5812\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r12, #0xa2\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0xf7\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0x9c\n\t" + "lsl r12, r12, #8\n\t" + "orr r12, r12, #0xd6\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r12, #0x9c\n\t" "lsl r12, r12, #8\n\t" "add r12, r12, #0xd6\n\t" @@ -5362,7 +8935,17 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov r12, #0x9cd6\n\t" #endif "movt r12, #0xa2f7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov lr, #20\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xde\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xf9\n\t" + "lsl lr, lr, #8\n\t" + "orr lr, lr, #0xde\n\t" +#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov lr, #0xf9\n\t" "lsl lr, lr, #8\n\t" "add lr, lr, #0xde\n\t" @@ -5370,6 +8953,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "mov lr, #0xf9de\n\t" #endif "movt lr, #0x14de\n\t" +#endif "and r10, r10, %[a]\n\t" "and r11, r11, %[a]\n\t" "and r12, r12, %[a]\n\t" @@ -5383,7 +8967,11 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "and %[a], %[a], #0x10000000\n\t" "adcs r8, r8, #0\n\t" "adc r9, r9, %[a]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "bic r9, r9, #0xf0000000\n\t" +#else "bfc r9, #28, #4\n\t" +#endif "ldr %[s], [sp, #68]\n\t" /* Store result */ "str %[b], [%[s]]\n\t" @@ -5394,13 +8982,14 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "str r7, [%[s], #20]\n\t" "str r8, [%[s], #24]\n\t" "str r9, [%[s], #28]\n\t" - "add sp, sp, #0x70\n\t" + "add sp, sp, #0x50\n\t" : [s] "+r" (s), [a] "+r" (a), [b] "+r" (b), [c] "+r" (c) : : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); } +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ #endif /* HAVE_ED25519_SIGN */ #endif /* HAVE_ED25519 */ @@ -5408,4 +8997,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) #endif /* HAVE_CURVE25519 || HAVE_ED25519 */ #endif /* !__aarch64__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* WOLFSSL_ARMASM */ + #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S index d2715c6de..88799443f 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S +++ 
b/wolfcrypt/src/port/arm/armv8-32-sha256-asm.S @@ -112,49 +112,49 @@ Transform_Sha256_Len: sub sp, sp, #0xc0 adr r3, L_SHA256_transform_len_k # Copy digest to add in at end -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else ldrd r8, r9, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r0, #24] ldr r11, [r0, #28] #else ldrd r10, r11, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #64] str r5, [sp, #68] #else strd r4, r5, [sp, #64] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #72] str r7, [sp, #76] #else strd r6, r7, [sp, #72] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [sp, #80] str r9, [sp, #84] #else strd r8, r9, [sp, #80] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r10, [sp, #88] str r11, [sp, #92] #else @@ -163,6 +163,136 @@ Transform_Sha256_Len: # Start of loop processing a block L_SHA256_transform_len_begin: # Load, Reverse and Store W - 64 bytes +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r6, [r1, #8] + ldr r7, [r1, #12] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp] + str r5, [sp, #4] +#else + strd r4, r5, [sp] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #8] + str r7, [sp, #12] +#else + strd r6, r7, [sp, #8] +#endif + ldr r4, [r1, #16] + ldr r5, [r1, #20] + ldr r6, [r1, #24] + ldr r7, [r1, #28] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #16] + str r5, [sp, #20] +#else + strd r4, r5, [sp, #16] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #24] + str r7, [sp, #28] +#else + strd r6, r7, [sp, #24] +#endif + ldr r4, [r1, #32] + ldr r5, [r1, #36] + ldr r6, [r1, #40] + ldr r7, [r1, #44] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + 
bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #32] + str r5, [sp, #36] +#else + strd r4, r5, [sp, #32] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #40] + str r7, [sp, #44] +#else + strd r6, r7, [sp, #40] +#endif + ldr r4, [r1, #48] + ldr r5, [r1, #52] + ldr r6, [r1, #56] + ldr r7, [r1, #60] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r4, [sp, #48] + str r5, [sp, #52] +#else + strd r4, r5, [sp, #48] +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + str r6, [sp, #56] + str r7, [sp, #60] +#else + strd r6, r7, [sp, #56] +#endif +#else ldr r4, [r1] ldr r5, [r1, #4] ldr r6, [r1, #8] @@ -179,25 +309,25 @@ L_SHA256_transform_len_begin: rev r9, r9 rev r10, r10 rev r11, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp] str r5, [sp, #4] #else strd r4, r5, [sp] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #8] str r7, [sp, #12] #else strd r6, r7, [sp, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [sp, #16] str r9, [sp, #20] #else strd r8, r9, [sp, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r10, [sp, #24] str r11, [sp, #28] #else @@ -219,30 +349,31 @@ L_SHA256_transform_len_begin: rev r9, r9 rev r10, r10 rev r11, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #32] str r5, [sp, #36] #else strd r4, r5, [sp, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #40] str r7, [sp, #44] #else strd r6, r7, [sp, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [sp, #48] str r9, [sp, #52] #else strd r8, r9, [sp, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r10, [sp, #56] str r11, [sp, #60] #else strd r10, r11, [sp, #56] #endif +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ ldr r11, [r0, #4] ldr r4, [r0, #8] eor r11, r11, r4 @@ -1517,25 +1648,25 @@ L_SHA256_transform_len_start: str r8, [r0, #16] str r9, [r0] # Add in digest from start -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #64] ldr r9, [sp, #68] #else ldrd r8, r9, [sp, #64] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [sp, #72] ldr r11, [sp, #76] #else @@ -1545,49 +1676,49 @@ L_SHA256_transform_len_start: add r5, r5, r9 add r6, r6, r10 add r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else strd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #8] str r7, [r0, #12] #else strd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #64] str r5, [sp, #68] #else strd r4, r5, [sp, #64] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #72] str r7, [sp, #76] #else strd r6, r7, [sp, #72] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #80] ldr r9, [sp, #84] #else ldrd r8, r9, [sp, #80] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [sp, #88] ldr r11, [sp, #92] #else @@ -1597,25 +1728,25 @@ L_SHA256_transform_len_start: add r5, r5, r9 add r6, r6, r10 add r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else strd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #24] str r7, [r0, #28] #else strd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #80] str r5, [sp, #84] #else strd r4, r5, [sp, #80] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #88] str r7, [sp, #92] #else @@ -1708,7 +1839,7 @@ Transform_Sha256_Len: push {r4, r5, r6, r7, r8, r9, r10, lr} vpush {d8-d11} sub sp, sp, #24 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r0, [sp] str r1, [sp, #4] #else @@ -1717,25 +1848,25 @@ Transform_Sha256_Len: str r2, [sp, #8] adr r12, L_SHA256_transform_neon_len_k # Load digest into registers -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r2, [r0] ldr r3, [r0, #4] #else ldrd r2, r3, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] 
#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else ldrd r6, r7, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else @@ -2666,7 +2797,7 @@ L_SHA256_transform_neon_len_start: add r2, r2, r1 ldr r10, [sp] # Add in digest from start -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r0, [r10] ldr r1, [r10, #4] #else @@ -2674,13 +2805,13 @@ L_SHA256_transform_neon_len_start: #endif add r2, r2, r0 add r3, r3, r1 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r2, [r10] str r3, [r10, #4] #else strd r2, r3, [r10] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r0, [r10, #8] ldr r1, [r10, #12] #else @@ -2688,13 +2819,13 @@ L_SHA256_transform_neon_len_start: #endif add r4, r4, r0 add r5, r5, r1 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r10, #8] str r5, [r10, #12] #else strd r4, r5, [r10, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r0, [r10, #16] ldr r1, [r10, #20] #else @@ -2702,13 +2833,13 @@ L_SHA256_transform_neon_len_start: #endif add r6, r6, r0 add r7, r7, r1 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r10, #16] str r7, [r10, #20] #else strd r6, r7, [r10, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r0, [r10, #24] ldr r1, [r10, #28] #else @@ -2716,7 +2847,7 @@ L_SHA256_transform_neon_len_start: #endif add r8, r8, r0 add r9, r9, r1 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r10, #24] str r9, [r10, #28] #else diff --git a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c index 49301d7dc..916d8a6a1 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha256-asm_c.c @@ -39,6 +39,18 @@ #include #include #ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ #ifndef NO_SHA256 #include @@ -73,49 +85,49 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) __asm__ __volatile__ ( "sub sp, sp, #0xc0\n\t" /* Copy digest to add in at end */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha256]]\n\t" "ldr r5, [%[sha256], #4]\n\t" #else "ldrd r4, r5, [%[sha256]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha256], #8]\n\t" "ldr r7, [%[sha256], #12]\n\t" #else "ldrd r6, r7, [%[sha256], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH 
< 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha256], #16]\n\t" "ldr r9, [%[sha256], #20]\n\t" #else "ldrd r8, r9, [%[sha256], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[sha256], #24]\n\t" "ldr r11, [%[sha256], #28]\n\t" #else "ldrd r10, r11, [%[sha256], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #64]\n\t" "str r5, [sp, #68]\n\t" #else "strd r4, r5, [sp, #64]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #72]\n\t" "str r7, [sp, #76]\n\t" #else "strd r6, r7, [sp, #72]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [sp, #80]\n\t" "str r9, [sp, #84]\n\t" #else "strd r8, r9, [sp, #80]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [sp, #88]\n\t" "str r11, [sp, #92]\n\t" #else @@ -125,6 +137,136 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "\n" "L_SHA256_transform_len_begin_%=: \n\t" /* Load, Reverse and Store W - 64 bytes */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "ldr r4, [%[data]]\n\t" + "ldr r5, [%[data], #4]\n\t" + "ldr r6, [%[data], #8]\n\t" + "ldr r7, [%[data], #12]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp]\n\t" + "str r5, [sp, #4]\n\t" +#else + "strd r4, r5, [sp]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #8]\n\t" + "str r7, [sp, #12]\n\t" +#else + "strd r6, r7, [sp, #8]\n\t" +#endif + "ldr r4, [%[data], #16]\n\t" + "ldr r5, [%[data], #20]\n\t" + "ldr r6, [%[data], #24]\n\t" + "ldr r7, [%[data], #28]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #16]\n\t" + "str r5, [sp, #20]\n\t" +#else + "strd r4, r5, [sp, #16]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #24]\n\t" + "str r7, [sp, #28]\n\t" +#else + "strd r6, r7, [sp, #24]\n\t" +#endif + "ldr r4, [%[data], #32]\n\t" + "ldr r5, [%[data], #36]\n\t" + "ldr r6, [%[data], #40]\n\t" + "ldr r7, [%[data], #44]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, 
#0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #32]\n\t" + "str r5, [sp, #36]\n\t" +#else + "strd r4, r5, [sp, #32]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #40]\n\t" + "str r7, [sp, #44]\n\t" +#else + "strd r6, r7, [sp, #40]\n\t" +#endif + "ldr r4, [%[data], #48]\n\t" + "ldr r5, [%[data], #52]\n\t" + "ldr r6, [%[data], #56]\n\t" + "ldr r7, [%[data], #60]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r4, [sp, #48]\n\t" + "str r5, [sp, #52]\n\t" +#else + "strd r4, r5, [sp, #48]\n\t" +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r6, [sp, #56]\n\t" + "str r7, [sp, #60]\n\t" +#else + "strd r6, r7, [sp, #56]\n\t" +#endif +#else "ldr r4, [%[data]]\n\t" "ldr r5, [%[data], #4]\n\t" "ldr r6, [%[data], #8]\n\t" @@ -141,25 +283,25 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "rev r9, r9\n\t" "rev r10, r10\n\t" "rev r11, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp]\n\t" "str r5, [sp, #4]\n\t" #else "strd r4, r5, [sp]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #8]\n\t" "str r7, [sp, #12]\n\t" #else "strd r6, r7, [sp, #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [sp, #16]\n\t" "str r9, [sp, #20]\n\t" #else "strd r8, r9, [sp, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [sp, #24]\n\t" "str r11, [sp, #28]\n\t" #else @@ -181,30 +323,31 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "rev r9, r9\n\t" "rev r10, r10\n\t" "rev r11, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #32]\n\t" "str r5, [sp, #36]\n\t" #else "strd r4, r5, [sp, #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #40]\n\t" "str r7, [sp, #44]\n\t" #else "strd r6, r7, [sp, #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [sp, #48]\n\t" "str r9, [sp, #52]\n\t" #else "strd r8, r9, [sp, #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [sp, #56]\n\t" "str r11, [sp, #60]\n\t" #else "strd r10, r11, [sp, #56]\n\t" #endif 
+#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "ldr r11, [%[sha256], #4]\n\t" "ldr r4, [%[sha256], #8]\n\t" "eor r11, r11, r4\n\t" @@ -1480,25 +1623,25 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "str r8, [%[sha256], #16]\n\t" "str r9, [%[sha256]]\n\t" /* Add in digest from start */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha256]]\n\t" "ldr r5, [%[sha256], #4]\n\t" #else "ldrd r4, r5, [%[sha256]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha256], #8]\n\t" "ldr r7, [%[sha256], #12]\n\t" #else "ldrd r6, r7, [%[sha256], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #64]\n\t" "ldr r9, [sp, #68]\n\t" #else "ldrd r8, r9, [sp, #64]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [sp, #72]\n\t" "ldr r11, [sp, #76]\n\t" #else @@ -1508,49 +1651,49 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "add r5, r5, r9\n\t" "add r6, r6, r10\n\t" "add r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha256]]\n\t" "str r5, [%[sha256], #4]\n\t" #else "strd r4, r5, [%[sha256]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha256], #8]\n\t" "str r7, [%[sha256], #12]\n\t" #else "strd r6, r7, [%[sha256], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #64]\n\t" "str r5, [sp, #68]\n\t" #else "strd r4, r5, [sp, #64]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #72]\n\t" "str r7, [sp, #76]\n\t" #else "strd r6, r7, [sp, #72]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha256], #16]\n\t" "ldr r5, [%[sha256], #20]\n\t" #else "ldrd r4, r5, [%[sha256], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha256], #24]\n\t" "ldr r7, [%[sha256], #28]\n\t" #else "ldrd r6, r7, [%[sha256], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #80]\n\t" "ldr r9, [sp, #84]\n\t" #else "ldrd r8, r9, [sp, #80]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [sp, #88]\n\t" "ldr r11, [sp, #92]\n\t" #else @@ -1560,25 +1703,25 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "add r5, r5, r9\n\t" "add r6, r6, r10\n\t" "add r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha256], #16]\n\t" "str r5, [%[sha256], #20]\n\t" #else "strd r4, r5, [%[sha256], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha256], #24]\n\t" "str r7, [%[sha256], #28]\n\t" #else "strd r6, r7, [%[sha256], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #80]\n\t" "str r5, [sp, #84]\n\t" #else "strd r4, r5, [sp, #80]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #88]\n\t" "str r7, [sp, #92]\n\t" #else @@ -1628,7 +1771,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) __asm__ __volatile__ ( "sub sp, sp, #24\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str %[sha256], [sp]\n\t" "str %[data], [sp, #4]\n\t" #else @@ -1637,25 +1780,25 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "str %[len], [sp, #8]\n\t" "mov r12, %[L_SHA256_transform_neon_len_k]\n\t" /* Load digest into registers */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr %[len], [%[sha256]]\n\t" "ldr r3, [%[sha256], #4]\n\t" #else "ldrd %[len], r3, [%[sha256]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha256], #8]\n\t" "ldr r5, [%[sha256], #12]\n\t" #else "ldrd r4, r5, [%[sha256], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha256], #16]\n\t" "ldr r7, [%[sha256], #20]\n\t" #else "ldrd r6, r7, [%[sha256], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha256], #24]\n\t" "ldr r9, [%[sha256], #28]\n\t" #else @@ -2588,7 +2731,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "add %[len], %[len], %[data]\n\t" "ldr r10, [sp]\n\t" /* Add in digest from start */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr %[sha256], [r10]\n\t" "ldr %[data], [r10, #4]\n\t" #else @@ -2596,13 +2739,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) #endif "add %[len], %[len], %[sha256]\n\t" "add r3, r3, %[data]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str %[len], [r10]\n\t" "str r3, [r10, #4]\n\t" #else "strd %[len], r3, [r10]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr %[sha256], [r10, #8]\n\t" "ldr %[data], [r10, #12]\n\t" #else @@ -2610,13 +2753,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) #endif "add r4, r4, %[sha256]\n\t" "add r5, r5, %[data]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [r10, #8]\n\t" "str r5, [r10, #12]\n\t" #else "strd r4, r5, [r10, #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr %[sha256], [r10, #16]\n\t" "ldr %[data], [r10, #20]\n\t" #else @@ -2624,13 +2767,13 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) #endif 
"add r6, r6, %[sha256]\n\t" "add r7, r7, %[data]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [r10, #16]\n\t" "str r7, [r10, #20]\n\t" #else "strd r6, r7, [r10, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr %[sha256], [r10, #24]\n\t" "ldr %[data], [r10, #28]\n\t" #else @@ -2638,7 +2781,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) #endif "add r8, r8, %[sha256]\n\t" "add r9, r9, %[data]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [r10, #24]\n\t" "str r9, [r10, #28]\n\t" #else @@ -2661,4 +2804,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) #endif /* !NO_SHA256 */ #endif /* !__aarch64__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* WOLFSSL_ARMASM */ + #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S index ba50a88b8..f06ae9284 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm.S @@ -208,97 +208,97 @@ Transform_Sha512_Len: sub sp, sp, #0xc0 adr r3, L_SHA512_transform_len_k # Copy digest to add in at end -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else ldrd r8, r9, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r0, #24] ldr r11, [r0, #28] #else ldrd r10, r11, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #128] str r5, [sp, #132] #else strd r4, r5, [sp, #128] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #136] str r7, [sp, #140] #else strd r6, r7, [sp, #136] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [sp, #144] str r9, [sp, #148] #else strd r8, r9, [sp, #144] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r10, [sp, #152] str r11, [sp, #156] #else strd r10, r11, [sp, #152] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else ldrd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, 
[r0, #52] #else ldrd r8, r9, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r0, #56] ldr r11, [r0, #60] #else ldrd r10, r11, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #160] str r5, [sp, #164] #else strd r4, r5, [sp, #160] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #168] str r7, [sp, #172] #else strd r6, r7, [sp, #168] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [sp, #176] str r9, [sp, #180] #else strd r8, r9, [sp, #176] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r10, [sp, #184] str r11, [sp, #188] #else @@ -306,7 +306,201 @@ Transform_Sha512_Len: #endif # Start of loop processing a block L_SHA512_transform_len_begin: - # Load, Reverse and Store W + # Load, Reverse and Store W - 64 bytes +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + ldr r4, [r1] + ldr r5, [r1, #4] + ldr r6, [r1, #8] + ldr r7, [r1, #12] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 + str r5, [sp] + str r4, [sp, #4] + str r7, [sp, #8] + str r6, [sp, #12] + ldr r4, [r1, #16] + ldr r5, [r1, #20] + ldr r6, [r1, #24] + ldr r7, [r1, #28] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 + str r5, [sp, #16] + str r4, [sp, #20] + str r7, [sp, #24] + str r6, [sp, #28] + ldr r4, [r1, #32] + ldr r5, [r1, #36] + ldr r6, [r1, #40] + ldr r7, [r1, #44] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 + str r5, [sp, #32] + str r4, [sp, #36] + str r7, [sp, #40] + str r6, [sp, #44] + ldr r4, [r1, #48] + ldr r5, [r1, #52] + ldr r6, [r1, #56] + ldr r7, [r1, #60] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 + str r5, [sp, #48] + str r4, [sp, #52] + str r7, [sp, #56] + str r6, [sp, #60] + ldr r4, [r1, #64] + ldr r5, [r1, #68] + ldr r6, [r1, #72] + ldr r7, [r1, #76] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + 
bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 + str r5, [sp, #64] + str r4, [sp, #68] + str r7, [sp, #72] + str r6, [sp, #76] + ldr r4, [r1, #80] + ldr r5, [r1, #84] + ldr r6, [r1, #88] + ldr r7, [r1, #92] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 + str r5, [sp, #80] + str r4, [sp, #84] + str r7, [sp, #88] + str r6, [sp, #92] + ldr r4, [r1, #96] + ldr r5, [r1, #100] + ldr r6, [r1, #104] + ldr r7, [r1, #108] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 + str r5, [sp, #96] + str r4, [sp, #100] + str r7, [sp, #104] + str r6, [sp, #108] + ldr r4, [r1, #112] + ldr r5, [r1, #116] + ldr r6, [r1, #120] + ldr r7, [r1, #124] + eor r8, r4, r4, ror #16 + eor r9, r5, r5, ror #16 + eor r10, r6, r6, ror #16 + eor r11, r7, r7, ror #16 + bic r8, r8, #0xff0000 + bic r9, r9, #0xff0000 + bic r10, r10, #0xff0000 + bic r11, r11, #0xff0000 + ror r4, r4, #8 + ror r5, r5, #8 + ror r6, r6, #8 + ror r7, r7, #8 + eor r4, r4, r8, lsr #8 + eor r5, r5, r9, lsr #8 + eor r6, r6, r10, lsr #8 + eor r7, r7, r11, lsr #8 + str r5, [sp, #112] + str r4, [sp, #116] + str r7, [sp, #120] + str r6, [sp, #124] +#else ldr r4, [r1] ldr r5, [r1, #4] ldr r6, [r1, #8] @@ -403,14 +597,15 @@ L_SHA512_transform_len_begin: str r8, [sp, #116] str r11, [sp, #120] str r10, [sp, #124] +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ # Pre-calc: b ^ c -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [r0, #8] ldr r11, [r0, #12] #else ldrd r10, r11, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -422,7 +617,7 @@ L_SHA512_transform_len_begin: # Start of 16 rounds L_SHA512_transform_len_start: # Round 0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -442,7 +637,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -452,25 +647,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else strd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, 
#36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else ldrd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else @@ -482,13 +677,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp] ldr r9, [sp, #4] #else @@ -496,7 +691,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3] ldr r7, [r3, #4] #else @@ -504,7 +699,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else @@ -512,7 +707,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else @@ -520,13 +715,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #24] str r9, [r0, #28] #else @@ -546,7 +741,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -556,19 +751,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else ldrd r8, r9, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else @@ -580,7 +775,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else @@ -588,7 +783,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #56] str r7, [r0, #60] #else @@ -597,7 +792,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[0] -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #112] ldr r5, [sp, #116] #else @@ -618,13 +813,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp] ldr r5, [sp, #4] #else ldrd r4, r5, [sp] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #72] ldr r9, [sp, #76] #else @@ -634,13 +829,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp] str r5, [sp, #4] #else strd r4, r5, [sp] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #8] ldr r5, [sp, #12] #else @@ -661,7 +856,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp] ldr r5, [sp, #4] #else @@ -669,14 +864,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp] str r5, [sp, #4] #else strd r4, r5, [sp] #endif # Round 1 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -696,7 +891,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -706,25 +901,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else strd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else ldrd r6, r7, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else @@ -736,13 +931,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #8] ldr r9, [sp, #12] #else @@ -750,7 +945,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #8] ldr r7, [r3, #12] 
#else @@ -758,7 +953,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else @@ -766,7 +961,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else @@ -774,13 +969,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #16] str r9, [r0, #20] #else @@ -800,7 +995,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -810,19 +1005,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else ldrd r8, r9, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else ldrd r6, r7, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else @@ -834,7 +1029,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else @@ -842,7 +1037,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #48] str r7, [r0, #52] #else @@ -851,7 +1046,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[1] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #120] ldr r5, [sp, #124] #else @@ -872,13 +1067,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #8] ldr r5, [sp, #12] #else ldrd r4, r5, [sp, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #80] ldr r9, [sp, #84] #else @@ -888,13 +1083,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #8] str r5, [sp, #12] #else strd r4, r5, [sp, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #16] ldr r5, [sp, #20] #else @@ -915,7 
+1110,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #8] ldr r5, [sp, #12] #else @@ -923,14 +1118,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #8] str r5, [sp, #12] #else strd r4, r5, [sp, #8] #endif # Round 2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -950,7 +1145,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -960,25 +1155,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else strd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else @@ -990,13 +1185,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #16] ldr r9, [sp, #20] #else @@ -1004,7 +1199,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #16] ldr r7, [r3, #20] #else @@ -1012,7 +1207,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else @@ -1020,7 +1215,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else @@ -1028,13 +1223,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #8] str r9, [r0, #12] #else @@ -1054,7 +1249,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, 
r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -1064,19 +1259,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else ldrd r8, r9, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else ldrd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else @@ -1088,7 +1283,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else @@ -1096,7 +1291,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #40] str r7, [r0, #44] #else @@ -1105,7 +1300,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[2] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp] ldr r5, [sp, #4] #else @@ -1126,13 +1321,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #16] ldr r5, [sp, #20] #else ldrd r4, r5, [sp, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #88] ldr r9, [sp, #92] #else @@ -1142,13 +1337,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #16] str r5, [sp, #20] #else strd r4, r5, [sp, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #24] ldr r5, [sp, #28] #else @@ -1169,7 +1364,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #16] ldr r5, [sp, #20] #else @@ -1177,14 +1372,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #16] str r5, [sp, #20] #else strd r4, r5, [sp, #16] #endif # Round 3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -1204,7 +1399,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -1214,25 +1409,25 @@ 
L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else strd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else ldrd r6, r7, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else @@ -1244,13 +1439,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #24] ldr r9, [sp, #28] #else @@ -1258,7 +1453,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #24] ldr r7, [r3, #28] #else @@ -1266,7 +1461,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else @@ -1274,7 +1469,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else @@ -1282,13 +1477,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0] str r9, [r0, #4] #else @@ -1308,7 +1503,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -1318,19 +1513,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else ldrd r8, r9, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else ldrd r6, r7, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else @@ -1342,7 +1537,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else @@ -1350,7 +1545,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #32] str r7, [r0, #36] #else @@ -1359,7 +1554,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[3] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #8] ldr r5, [sp, #12] #else @@ -1380,13 +1575,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #24] ldr r5, [sp, #28] #else ldrd r4, r5, [sp, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #96] ldr r9, [sp, #100] #else @@ -1396,13 +1591,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #24] str r5, [sp, #28] #else strd r4, r5, [sp, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #32] ldr r5, [sp, #36] #else @@ -1423,7 +1618,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #24] ldr r5, [sp, #28] #else @@ -1431,14 +1626,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #24] str r5, [sp, #28] #else strd r4, r5, [sp, #24] #endif # Round 4 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -1458,7 +1653,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -1468,25 +1663,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else strd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else @@ -1498,13 +1693,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 
7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #32] ldr r9, [sp, #36] #else @@ -1512,7 +1707,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #32] ldr r7, [r3, #36] #else @@ -1520,7 +1715,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else @@ -1528,7 +1723,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else @@ -1536,13 +1731,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #56] str r9, [r0, #60] #else @@ -1562,7 +1757,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -1572,19 +1767,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else ldrd r8, r9, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else ldrd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else @@ -1596,7 +1791,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else @@ -1604,7 +1799,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #24] str r7, [r0, #28] #else @@ -1613,7 +1808,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[4] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #16] ldr r5, [sp, #20] #else @@ -1634,13 +1829,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #32] ldr r5, [sp, #36] #else ldrd r4, r5, [sp, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) 
ldr r8, [sp, #104] ldr r9, [sp, #108] #else @@ -1650,13 +1845,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #32] str r5, [sp, #36] #else strd r4, r5, [sp, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #40] ldr r5, [sp, #44] #else @@ -1677,7 +1872,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #32] ldr r5, [sp, #36] #else @@ -1685,14 +1880,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #32] str r5, [sp, #36] #else strd r4, r5, [sp, #32] #endif # Round 5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -1712,7 +1907,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -1722,25 +1917,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else strd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else ldrd r6, r7, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else @@ -1752,13 +1947,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #40] ldr r9, [sp, #44] #else @@ -1766,7 +1961,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #40] ldr r7, [r3, #44] #else @@ -1774,7 +1969,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else @@ -1782,7 +1977,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else @@ -1790,13 +1985,13 
@@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #48] str r9, [r0, #52] #else @@ -1816,7 +2011,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -1826,19 +2021,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else ldrd r8, r9, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else ldrd r6, r7, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else @@ -1850,7 +2045,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else @@ -1858,7 +2053,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #16] str r7, [r0, #20] #else @@ -1867,7 +2062,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[5] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #24] ldr r5, [sp, #28] #else @@ -1888,13 +2083,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #40] ldr r5, [sp, #44] #else ldrd r4, r5, [sp, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #112] ldr r9, [sp, #116] #else @@ -1904,13 +2099,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #40] str r5, [sp, #44] #else strd r4, r5, [sp, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #48] ldr r5, [sp, #52] #else @@ -1931,7 +2126,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #40] ldr r5, [sp, #44] #else @@ -1939,14 +2134,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #40] str r5, [sp, #44] 
#else strd r4, r5, [sp, #40] #endif # Round 6 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -1966,7 +2161,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -1976,25 +2171,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else strd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else ldrd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else @@ -2006,13 +2201,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #48] ldr r9, [sp, #52] #else @@ -2020,7 +2215,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #48] ldr r7, [r3, #52] #else @@ -2028,7 +2223,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else @@ -2036,7 +2231,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else @@ -2044,13 +2239,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #40] str r9, [r0, #44] #else @@ -2070,7 +2265,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -2080,19 +2275,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else ldrd r8, r9, [r0, #16] #endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else @@ -2104,7 +2299,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else @@ -2112,7 +2307,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #8] str r7, [r0, #12] #else @@ -2121,7 +2316,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[6] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #32] ldr r5, [sp, #36] #else @@ -2142,13 +2337,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #48] ldr r5, [sp, #52] #else ldrd r4, r5, [sp, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #120] ldr r9, [sp, #124] #else @@ -2158,13 +2353,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #48] str r5, [sp, #52] #else strd r4, r5, [sp, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #56] ldr r5, [sp, #60] #else @@ -2185,7 +2380,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #48] ldr r5, [sp, #52] #else @@ -2193,14 +2388,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #48] str r5, [sp, #52] #else strd r4, r5, [sp, #48] #endif # Round 7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -2220,7 +2415,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -2230,25 +2425,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else strd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else ldrd r6, r7, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else @@ -2260,13 +2455,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #56] ldr r9, [sp, #60] #else @@ -2274,7 +2469,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #56] ldr r7, [r3, #60] #else @@ -2282,7 +2477,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else @@ -2290,7 +2485,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else @@ -2298,13 +2493,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #32] str r9, [r0, #36] #else @@ -2324,7 +2519,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -2334,19 +2529,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else ldrd r8, r9, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else ldrd r6, r7, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else @@ -2358,7 +2553,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else @@ -2366,7 +2561,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0] str r7, [r0, #4] #else @@ -2375,7 +2570,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[7] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) 
&& (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #40] ldr r5, [sp, #44] #else @@ -2396,13 +2591,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #56] ldr r5, [sp, #60] #else ldrd r4, r5, [sp, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp] ldr r9, [sp, #4] #else @@ -2412,13 +2607,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #56] str r5, [sp, #60] #else strd r4, r5, [sp, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #64] ldr r5, [sp, #68] #else @@ -2439,7 +2634,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #56] ldr r5, [sp, #60] #else @@ -2447,14 +2642,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #56] str r5, [sp, #60] #else strd r4, r5, [sp, #56] #endif # Round 8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -2474,7 +2669,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -2484,25 +2679,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else strd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else ldrd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else @@ -2514,13 +2709,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #64] ldr r9, [sp, #68] #else @@ -2528,7 +2723,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #64] ldr r7, [r3, #68] #else @@ -2536,7 +2731,7 @@ 
L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else @@ -2544,7 +2739,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else @@ -2552,13 +2747,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #24] str r9, [r0, #28] #else @@ -2578,7 +2773,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -2588,19 +2783,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else ldrd r8, r9, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else @@ -2612,7 +2807,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else @@ -2620,7 +2815,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #56] str r7, [r0, #60] #else @@ -2629,7 +2824,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[8] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #48] ldr r5, [sp, #52] #else @@ -2650,13 +2845,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #64] ldr r5, [sp, #68] #else ldrd r4, r5, [sp, #64] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #8] ldr r9, [sp, #12] #else @@ -2666,13 +2861,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #64] str r5, [sp, #68] #else strd r4, r5, [sp, #64] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #72] ldr r5, [sp, #76] #else @@ -2693,7 +2888,7 @@ 
L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #64] ldr r5, [sp, #68] #else @@ -2701,14 +2896,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #64] str r5, [sp, #68] #else strd r4, r5, [sp, #64] #endif # Round 9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -2728,7 +2923,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -2738,25 +2933,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else strd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else ldrd r6, r7, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else @@ -2768,13 +2963,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #72] ldr r9, [sp, #76] #else @@ -2782,7 +2977,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #72] ldr r7, [r3, #76] #else @@ -2790,7 +2985,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else @@ -2798,7 +2993,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else @@ -2806,13 +3001,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #16] str r9, [r0, #20] #else @@ -2832,7 +3027,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, 
r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -2842,19 +3037,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else ldrd r8, r9, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else ldrd r6, r7, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else @@ -2866,7 +3061,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else @@ -2874,7 +3069,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #48] str r7, [r0, #52] #else @@ -2883,7 +3078,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[9] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #56] ldr r5, [sp, #60] #else @@ -2904,13 +3099,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #72] ldr r5, [sp, #76] #else ldrd r4, r5, [sp, #72] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #16] ldr r9, [sp, #20] #else @@ -2920,13 +3115,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #72] str r5, [sp, #76] #else strd r4, r5, [sp, #72] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #80] ldr r5, [sp, #84] #else @@ -2947,7 +3142,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #72] ldr r5, [sp, #76] #else @@ -2955,14 +3150,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #72] str r5, [sp, #76] #else strd r4, r5, [sp, #72] #endif # Round 10 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -2982,7 +3177,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -2992,25 +3187,25 @@ 
L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else strd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else @@ -3022,13 +3217,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #80] ldr r9, [sp, #84] #else @@ -3036,7 +3231,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #80] ldr r7, [r3, #84] #else @@ -3044,7 +3239,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else @@ -3052,7 +3247,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else @@ -3060,13 +3255,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #8] str r9, [r0, #12] #else @@ -3086,7 +3281,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -3096,19 +3291,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else ldrd r8, r9, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else ldrd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else @@ -3120,7 +3315,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 
7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else @@ -3128,7 +3323,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #40] str r7, [r0, #44] #else @@ -3137,7 +3332,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[10] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #64] ldr r5, [sp, #68] #else @@ -3158,13 +3353,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #80] ldr r5, [sp, #84] #else ldrd r4, r5, [sp, #80] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #24] ldr r9, [sp, #28] #else @@ -3174,13 +3369,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #80] str r5, [sp, #84] #else strd r4, r5, [sp, #80] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #88] ldr r5, [sp, #92] #else @@ -3201,7 +3396,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #80] ldr r5, [sp, #84] #else @@ -3209,14 +3404,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #80] str r5, [sp, #84] #else strd r4, r5, [sp, #80] #endif # Round 11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -3236,7 +3431,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -3246,25 +3441,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else strd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else ldrd r6, r7, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else @@ -3276,13 +3471,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #88] ldr r9, [sp, #92] #else @@ -3290,7 +3485,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #88] ldr r7, [r3, #92] #else @@ -3298,7 +3493,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else @@ -3306,7 +3501,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else @@ -3314,13 +3509,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0] str r9, [r0, #4] #else @@ -3340,7 +3535,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -3350,19 +3545,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else ldrd r8, r9, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else ldrd r6, r7, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else @@ -3374,7 +3569,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else @@ -3382,7 +3577,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #32] str r7, [r0, #36] #else @@ -3391,7 +3586,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[11] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #72] ldr r5, [sp, #76] #else @@ -3412,13 +3607,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #88] ldr r5, [sp, #92] #else ldrd r4, r5, [sp, #88] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #32] ldr r9, [sp, #36] #else @@ -3428,13 +3623,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #88] str r5, [sp, #92] #else strd r4, r5, [sp, #88] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #96] ldr r5, [sp, #100] #else @@ -3455,7 +3650,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #88] ldr r5, [sp, #92] #else @@ -3463,14 +3658,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #88] str r5, [sp, #92] #else strd r4, r5, [sp, #88] #endif # Round 12 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -3490,7 +3685,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -3500,25 +3695,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else strd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else @@ -3530,13 +3725,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #96] ldr r9, [sp, #100] #else @@ -3544,7 +3739,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #96] ldr r7, [r3, #100] #else @@ -3552,7 +3747,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else @@ -3560,7 +3755,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, 
#24] str r5, [r0, #28] #else @@ -3568,13 +3763,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #56] str r9, [r0, #60] #else @@ -3594,7 +3789,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -3604,19 +3799,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else ldrd r8, r9, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else ldrd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else @@ -3628,7 +3823,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else @@ -3636,7 +3831,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #24] str r7, [r0, #28] #else @@ -3645,7 +3840,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[12] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #80] ldr r5, [sp, #84] #else @@ -3666,13 +3861,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #96] ldr r5, [sp, #100] #else ldrd r4, r5, [sp, #96] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #40] ldr r9, [sp, #44] #else @@ -3682,13 +3877,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #96] str r5, [sp, #100] #else strd r4, r5, [sp, #96] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #104] ldr r5, [sp, #108] #else @@ -3709,7 +3904,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #96] ldr r5, [sp, #100] #else @@ -3717,14 +3912,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) str r4, [sp, #96] str r5, [sp, #100] #else strd r4, r5, [sp, #96] #endif # Round 13 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -3744,7 +3939,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -3754,25 +3949,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else strd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else ldrd r6, r7, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else @@ -3784,13 +3979,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #104] ldr r9, [sp, #108] #else @@ -3798,7 +3993,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #104] ldr r7, [r3, #108] #else @@ -3806,7 +4001,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else @@ -3814,7 +4009,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else @@ -3822,13 +4017,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #48] str r9, [r0, #52] #else @@ -3848,7 +4043,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -3858,19 +4053,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, 
[r0, #28] #else ldrd r8, r9, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else ldrd r6, r7, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else @@ -3882,7 +4077,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else @@ -3890,7 +4085,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #16] str r7, [r0, #20] #else @@ -3899,7 +4094,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[13] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #88] ldr r5, [sp, #92] #else @@ -3920,13 +4115,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #104] ldr r5, [sp, #108] #else ldrd r4, r5, [sp, #104] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #48] ldr r9, [sp, #52] #else @@ -3936,13 +4131,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #104] str r5, [sp, #108] #else strd r4, r5, [sp, #104] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #112] ldr r5, [sp, #116] #else @@ -3963,7 +4158,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #104] ldr r5, [sp, #108] #else @@ -3971,14 +4166,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #104] str r5, [sp, #108] #else strd r4, r5, [sp, #104] #endif # Round 14 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -3998,7 +4193,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -4008,25 +4203,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else strd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] 
#else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else ldrd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else @@ -4038,13 +4233,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #112] ldr r9, [sp, #116] #else @@ -4052,7 +4247,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #112] ldr r7, [r3, #116] #else @@ -4060,7 +4255,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else @@ -4068,7 +4263,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else @@ -4076,13 +4271,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #40] str r9, [r0, #44] #else @@ -4102,7 +4297,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -4112,19 +4307,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else ldrd r8, r9, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else @@ -4136,7 +4331,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else @@ -4144,7 +4339,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #8] str r7, [r0, #12] #else @@ -4153,7 +4348,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 
# Calc new W[14] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #96] ldr r5, [sp, #100] #else @@ -4174,13 +4369,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #112] ldr r5, [sp, #116] #else ldrd r4, r5, [sp, #112] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #56] ldr r9, [sp, #60] #else @@ -4190,13 +4385,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #112] str r5, [sp, #116] #else strd r4, r5, [sp, #112] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #120] ldr r5, [sp, #124] #else @@ -4217,7 +4412,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #112] ldr r5, [sp, #116] #else @@ -4225,14 +4420,14 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #112] str r5, [sp, #116] #else strd r4, r5, [sp, #112] #endif # Round 15 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -4252,7 +4447,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -4262,25 +4457,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else strd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else ldrd r6, r7, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else @@ -4292,13 +4487,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #120] ldr r9, [sp, #124] #else @@ -4306,7 +4501,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #120] ldr r7, [r3, #124] #else @@ -4314,7 +4509,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else @@ -4322,7 +4517,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else @@ -4330,13 +4525,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #32] str r9, [r0, #36] #else @@ -4356,7 +4551,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -4366,19 +4561,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else ldrd r8, r9, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else ldrd r6, r7, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else @@ -4390,7 +4585,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else @@ -4398,7 +4593,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0] str r7, [r0, #4] #else @@ -4407,7 +4602,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Calc new W[15] -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #104] ldr r5, [sp, #108] #else @@ -4428,13 +4623,13 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #26 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #120] ldr r5, [sp, #124] #else ldrd r4, r5, [sp, #120] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #64] ldr r9, [sp, #68] #else @@ -4444,13 +4639,13 @@ L_SHA512_transform_len_start: adc r5, r5, r7 adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #120] str r5, [sp, #124] #else strd r4, r5, [sp, #120] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp] ldr r5, [sp, #4] #else @@ -4471,7 +4666,7 @@ L_SHA512_transform_len_start: orr r8, r8, r5, lsl #25 eor r7, r7, r9 eor r6, r6, r8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [sp, #120] ldr r5, [sp, #124] #else @@ -4479,7 +4674,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #120] str r5, [sp, #124] #else @@ -4489,7 +4684,7 @@ L_SHA512_transform_len_start: subs r12, r12, #1 bne L_SHA512_transform_len_start # Round 0 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -4509,7 +4704,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -4519,25 +4714,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else strd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else ldrd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else @@ -4549,13 +4744,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp] ldr r9, [sp, #4] #else @@ -4563,7 +4758,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3] ldr r7, [r3, #4] #else @@ -4571,7 +4766,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else @@ -4579,7 +4774,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else @@ -4587,13 +4782,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #24] str r9, [r0, #28] #else @@ -4613,7 +4808,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -4623,19 +4818,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else ldrd r8, r9, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else @@ -4647,7 +4842,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else @@ -4655,7 +4850,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #56] str r7, [r0, #60] #else @@ -4664,7 +4859,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 1 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -4684,7 +4879,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -4694,25 +4889,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else strd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else ldrd r6, r7, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else @@ -4724,13 +4919,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #8] ldr r9, [sp, #12] #else @@ -4738,7 +4933,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr 
r6, [r3, #8] ldr r7, [r3, #12] #else @@ -4746,7 +4941,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else @@ -4754,7 +4949,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else @@ -4762,13 +4957,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #16] str r9, [r0, #20] #else @@ -4788,7 +4983,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -4798,19 +4993,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else ldrd r8, r9, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else ldrd r6, r7, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else @@ -4822,7 +5017,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else @@ -4830,7 +5025,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #48] str r7, [r0, #52] #else @@ -4839,7 +5034,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 2 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -4859,7 +5054,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -4869,25 +5064,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else strd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, 
[r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else @@ -4899,13 +5094,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #16] ldr r9, [sp, #20] #else @@ -4913,7 +5108,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #16] ldr r7, [r3, #20] #else @@ -4921,7 +5116,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else @@ -4929,7 +5124,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else @@ -4937,13 +5132,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #8] str r9, [r0, #12] #else @@ -4963,7 +5158,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -4973,19 +5168,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else ldrd r8, r9, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else ldrd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else @@ -4997,7 +5192,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else @@ -5005,7 +5200,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #40] str r7, [r0, #44] #else @@ -5014,7 +5209,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 3 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, 
[r0, #12] #else @@ -5034,7 +5229,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -5044,25 +5239,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else strd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else ldrd r6, r7, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else @@ -5074,13 +5269,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #24] ldr r9, [sp, #28] #else @@ -5088,7 +5283,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #24] ldr r7, [r3, #28] #else @@ -5096,7 +5291,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else @@ -5104,7 +5299,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else @@ -5112,13 +5307,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0] str r9, [r0, #4] #else @@ -5138,7 +5333,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -5148,19 +5343,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else ldrd r8, r9, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else ldrd r6, r7, [r0, #48] #endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else @@ -5172,7 +5367,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else @@ -5180,7 +5375,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #32] str r7, [r0, #36] #else @@ -5189,7 +5384,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 4 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -5209,7 +5404,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -5219,25 +5414,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else strd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else @@ -5249,13 +5444,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #32] ldr r9, [sp, #36] #else @@ -5263,7 +5458,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #32] ldr r7, [r3, #36] #else @@ -5271,7 +5466,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else @@ -5279,7 +5474,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else @@ -5287,13 +5482,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #56] str r9, [r0, #60] #else @@ -5313,7 +5508,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -5323,19 +5518,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else ldrd r8, r9, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else ldrd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else @@ -5347,7 +5542,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else @@ -5355,7 +5550,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #24] str r7, [r0, #28] #else @@ -5364,7 +5559,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -5384,7 +5579,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -5394,25 +5589,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else strd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else ldrd r6, r7, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else @@ -5424,13 +5619,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #40] ldr r9, [sp, #44] #else @@ -5438,7 +5633,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) 
&& (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #40] ldr r7, [r3, #44] #else @@ -5446,7 +5641,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else @@ -5454,7 +5649,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else @@ -5462,13 +5657,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #48] str r9, [r0, #52] #else @@ -5488,7 +5683,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -5498,19 +5693,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else ldrd r8, r9, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else ldrd r6, r7, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else @@ -5522,7 +5717,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else @@ -5530,7 +5725,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #16] str r7, [r0, #20] #else @@ -5539,7 +5734,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 6 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -5559,7 +5754,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -5569,25 +5764,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else strd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else ldrd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else @@ -5599,13 +5794,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #48] ldr r9, [sp, #52] #else @@ -5613,7 +5808,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #48] ldr r7, [r3, #52] #else @@ -5621,7 +5816,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else @@ -5629,7 +5824,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else @@ -5637,13 +5832,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #40] str r9, [r0, #44] #else @@ -5663,7 +5858,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -5673,19 +5868,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else ldrd r8, r9, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else @@ -5697,7 +5892,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else @@ -5705,7 +5900,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #8] str r7, [r0, #12] #else @@ -5714,7 +5909,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -5734,7 +5929,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -5744,25 +5939,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else strd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else ldrd r6, r7, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else @@ -5774,13 +5969,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #56] ldr r9, [sp, #60] #else @@ -5788,7 +5983,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #56] ldr r7, [r3, #60] #else @@ -5796,7 +5991,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else @@ -5804,7 +5999,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else @@ -5812,13 +6007,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #32] str r9, [r0, #36] #else @@ -5838,7 +6033,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -5848,19 +6043,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else ldrd r8, r9, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else ldrd r6, r7, [r0, #16] #endif 
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else @@ -5872,7 +6067,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else @@ -5880,7 +6075,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0] str r7, [r0, #4] #else @@ -5889,7 +6084,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 8 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -5909,7 +6104,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -5919,25 +6114,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else strd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else ldrd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else @@ -5949,13 +6144,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #64] ldr r9, [sp, #68] #else @@ -5963,7 +6158,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #64] ldr r7, [r3, #68] #else @@ -5971,7 +6166,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else @@ -5979,7 +6174,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else @@ -5987,13 +6182,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #24] str r9, [r0, #28] #else @@ -6013,7 +6208,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -6023,19 +6218,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else ldrd r8, r9, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #56] str r5, [r0, #60] #else @@ -6047,7 +6242,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else @@ -6055,7 +6250,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #56] str r7, [r0, #60] #else @@ -6064,7 +6259,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -6084,7 +6279,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -6094,25 +6289,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else strd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else ldrd r6, r7, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else @@ -6124,13 +6319,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #72] ldr r9, [sp, #76] #else @@ -6138,7 +6333,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) 
&& (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #72] ldr r7, [r3, #76] #else @@ -6146,7 +6341,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else @@ -6154,7 +6349,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else @@ -6162,13 +6357,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #16] str r9, [r0, #20] #else @@ -6188,7 +6383,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -6198,19 +6393,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else ldrd r8, r9, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else ldrd r6, r7, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else @@ -6222,7 +6417,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else @@ -6230,7 +6425,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #48] str r7, [r0, #52] #else @@ -6239,7 +6434,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 10 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -6259,7 +6454,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -6269,25 +6464,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else strd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) 
&& (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else @@ -6299,13 +6494,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #80] ldr r9, [sp, #84] #else @@ -6313,7 +6508,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #80] ldr r7, [r3, #84] #else @@ -6321,7 +6516,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else @@ -6329,7 +6524,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else @@ -6337,13 +6532,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #8] str r9, [r0, #12] #else @@ -6363,7 +6558,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -6373,19 +6568,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else ldrd r8, r9, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else ldrd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #40] str r5, [r0, #44] #else @@ -6397,7 +6592,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else @@ -6405,7 +6600,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #40] str r7, [r0, #44] #else @@ -6414,7 +6609,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -6434,7 +6629,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -6444,25 +6639,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else strd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else ldrd r6, r7, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else @@ -6474,13 +6669,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #88] ldr r9, [sp, #92] #else @@ -6488,7 +6683,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #88] ldr r7, [r3, #92] #else @@ -6496,7 +6691,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else @@ -6504,7 +6699,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else @@ -6512,13 +6707,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0] str r9, [r0, #4] #else @@ -6538,7 +6733,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else @@ -6548,19 +6743,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else ldrd r8, r9, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else 
ldrd r6, r7, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else @@ -6572,7 +6767,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else @@ -6580,7 +6775,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #32] str r7, [r0, #36] #else @@ -6589,7 +6784,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 12 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -6609,7 +6804,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -6619,25 +6814,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else strd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else @@ -6649,13 +6844,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #96] ldr r9, [sp, #100] #else @@ -6663,7 +6858,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #96] ldr r7, [r3, #100] #else @@ -6671,7 +6866,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else @@ -6679,7 +6874,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else @@ -6687,13 +6882,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif 
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #56] str r9, [r0, #60] #else @@ -6713,7 +6908,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else @@ -6723,19 +6918,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else ldrd r8, r9, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else ldrd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #24] str r5, [r0, #28] #else @@ -6747,7 +6942,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else @@ -6755,7 +6950,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #24] str r7, [r0, #28] #else @@ -6764,7 +6959,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 13 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else @@ -6784,7 +6979,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -6794,25 +6989,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else strd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #56] ldr r5, [r0, #60] #else ldrd r4, r5, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else ldrd r6, r7, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else @@ -6824,13 +7019,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #104] ldr r9, [sp, #108] #else @@ -6838,7 +7033,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #104] ldr r7, [r3, #108] #else @@ -6846,7 +7041,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #48] ldr r9, [r0, #52] #else @@ -6854,7 +7049,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else @@ -6862,13 +7057,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #24] ldr r5, [r0, #28] #else ldrd r4, r5, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #48] str r9, [r0, #52] #else @@ -6888,7 +7083,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else @@ -6898,19 +7093,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #24] ldr r9, [r0, #28] #else ldrd r8, r9, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #32] ldr r7, [r0, #36] #else ldrd r6, r7, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else @@ -6922,7 +7117,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else @@ -6930,7 +7125,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #16] str r7, [r0, #20] #else @@ -6939,7 +7134,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 14 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else @@ -6959,7 +7154,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -6969,25 +7164,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else strd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else ldrd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0] ldr r9, [r0, #4] #else @@ -6999,13 +7194,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #112] ldr r9, [sp, #116] #else @@ -7013,7 +7208,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #112] ldr r7, [r3, #116] #else @@ -7021,7 +7216,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #40] ldr r9, [r0, #44] #else @@ -7029,7 +7224,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else @@ -7037,13 +7232,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #40] str r9, [r0, #44] #else @@ -7063,7 +7258,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else @@ -7073,19 +7268,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #16] ldr r9, [r0, #20] #else ldrd r8, r9, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #8] str r5, [r0, #12] #else @@ -7097,7 +7292,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else @@ -7105,7 +7300,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #8] str r7, [r0, #12] #else @@ -7114,7 +7309,7 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Round 15 -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else @@ -7134,7 +7329,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #23 orr r9, r9, r4, lsr #9 orr r8, r8, r5, lsr #9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -7144,25 +7339,25 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else strd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #40] ldr r5, [r0, #44] #else ldrd r4, r5, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #48] ldr r7, [r0, #52] #else ldrd r6, r7, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #56] ldr r9, [r0, #60] #else @@ -7174,13 +7369,13 @@ L_SHA512_transform_len_start: and r7, r7, r5 eor r6, r6, r8 eor r7, r7, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #120] ldr r9, [sp, #124] #else @@ -7188,7 +7383,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r3, #120] ldr r7, [r3, #124] #else @@ -7196,7 +7391,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r8 adc r5, r5, r9 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #32] ldr r9, [r0, #36] #else @@ -7204,7 +7399,7 @@ L_SHA512_transform_len_start: #endif adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else @@ -7212,13 +7407,13 @@ L_SHA512_transform_len_start: #endif adds r8, r8, r4 adc r9, r9, r5 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #8] ldr r5, [r0, #12] #else ldrd r4, r5, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r8, [r0, #32] str r9, [r0, #36] #else @@ -7238,7 +7433,7 @@ L_SHA512_transform_len_start: lsls r9, r5, #25 orr r9, r9, r4, lsr #7 orr r8, r8, r5, lsr #7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else @@ -7248,19 +7443,19 @@ L_SHA512_transform_len_start: eor r7, r7, r9 adds r4, r4, r6 adc r5, r5, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [r0, #8] ldr r9, [r0, #12] #else ldrd r8, r9, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #16] ldr r7, [r0, #20] #else ldrd r6, r7, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else @@ -7272,7 +7467,7 @@ L_SHA512_transform_len_start: and r11, r11, r9 eor r10, r10, r6 eor r11, r11, r7 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0] ldr r7, [r0, #4] #else @@ -7280,7 +7475,7 @@ L_SHA512_transform_len_start: #endif adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0] str r7, [r0, #4] #else @@ -7289,25 +7484,25 @@ L_SHA512_transform_len_start: mov r10, r8 mov r11, r9 # Add in digest from start -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0] ldr r5, [r0, #4] #else ldrd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #8] ldr r7, [r0, #12] #else ldrd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #128] ldr r9, [sp, #132] #else ldrd r8, r9, [sp, #128] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [sp, #136] ldr r11, [sp, #140] #else @@ -7317,49 +7512,49 @@ L_SHA512_transform_len_start: adc r5, r5, r9 adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0] str r5, [r0, #4] #else strd r4, r5, [r0] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #8] str r7, [r0, #12] #else strd r6, r7, [r0, #8] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #128] str r5, [sp, #132] #else strd r4, r5, [sp, #128] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #136] str r7, [sp, #140] #else strd r6, r7, [sp, #136] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] #else ldrd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] #else ldrd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #144] ldr r9, [sp, #148] #else ldrd r8, r9, [sp, #144] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [sp, #152] ldr r11, [sp, #156] #else @@ -7369,49 +7564,49 @@ L_SHA512_transform_len_start: adc r5, r5, r9 adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #16] str r5, [r0, #20] #else strd r4, r5, [r0, #16] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH 
< 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #24] str r7, [r0, #28] #else strd r6, r7, [r0, #24] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #144] str r5, [sp, #148] #else strd r4, r5, [sp, #144] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #152] str r7, [sp, #156] #else strd r6, r7, [sp, #152] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #32] ldr r5, [r0, #36] #else ldrd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #40] ldr r7, [r0, #44] #else ldrd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #160] ldr r9, [sp, #164] #else ldrd r8, r9, [sp, #160] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [sp, #168] ldr r11, [sp, #172] #else @@ -7421,49 +7616,49 @@ L_SHA512_transform_len_start: adc r5, r5, r9 adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #32] str r5, [r0, #36] #else strd r4, r5, [r0, #32] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #40] str r7, [r0, #44] #else strd r6, r7, [r0, #40] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #160] str r5, [sp, #164] #else strd r4, r5, [sp, #160] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #168] str r7, [sp, #172] #else strd r6, r7, [sp, #168] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #48] ldr r5, [r0, #52] #else ldrd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #56] ldr r7, [r0, #60] #else ldrd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r8, [sp, #176] ldr r9, [sp, #180] #else ldrd r8, r9, [sp, #176] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r10, [sp, #184] ldr r11, [sp, #188] #else @@ -7473,25 +7668,25 @@ L_SHA512_transform_len_start: adc r5, r5, r9 adds r6, r6, r10 adc r7, r7, r11 -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [r0, #48] str r5, [r0, #52] #else strd r4, r5, [r0, #48] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [r0, #56] str r7, [r0, #60] #else strd r6, r7, [r0, #56] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r4, [sp, #176] str r5, [sp, #180] #else strd r4, r5, 
[sp, #176] #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) str r6, [sp, #184] str r7, [sp, #188] #else diff --git a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c index 0171ea883..c2ffa77b9 100644 --- a/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-sha512-asm_c.c @@ -39,6 +39,18 @@ #include #include #ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ #ifdef WOLFSSL_SHA512 #include @@ -97,97 +109,97 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) __asm__ __volatile__ ( "sub sp, sp, #0xc0\n\t" /* Copy digest to add in at end */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else "ldrd r6, r7, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else "ldrd r8, r9, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[sha512], #24]\n\t" "ldr r11, [%[sha512], #28]\n\t" #else "ldrd r10, r11, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #128]\n\t" "str r5, [sp, #132]\n\t" #else "strd r4, r5, [sp, #128]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #136]\n\t" "str r7, [sp, #140]\n\t" #else "strd r6, r7, [sp, #136]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [sp, #144]\n\t" "str r9, [sp, #148]\n\t" #else "strd r8, r9, [sp, #144]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [sp, #152]\n\t" "str r11, [sp, #156]\n\t" #else "strd r10, r11, [sp, #152]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else "ldrd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else "ldrd r8, r9, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[sha512], #56]\n\t" "ldr r11, [%[sha512], #60]\n\t" #else "ldrd r10, r11, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #160]\n\t" "str r5, [sp, #164]\n\t" #else "strd r4, r5, [sp, #160]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #168]\n\t" "str r7, [sp, #172]\n\t" #else "strd r6, r7, [sp, #168]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [sp, #176]\n\t" "str r9, [sp, #180]\n\t" #else "strd r8, r9, [sp, #176]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [sp, #184]\n\t" "str r11, [sp, #188]\n\t" #else @@ -196,7 +208,201 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) /* Start of loop processing a block */ "\n" "L_SHA512_transform_len_begin_%=: \n\t" - /* Load, Reverse and Store W */ + /* Load, Reverse and Store W - 64 bytes */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "ldr r4, [%[data]]\n\t" + "ldr r5, [%[data], #4]\n\t" + "ldr r6, [%[data], #8]\n\t" + "ldr r7, [%[data], #12]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" + "str r5, [sp]\n\t" + "str r4, [sp, #4]\n\t" + "str r7, [sp, #8]\n\t" + "str r6, [sp, #12]\n\t" + "ldr r4, [%[data], #16]\n\t" + "ldr r5, [%[data], #20]\n\t" + "ldr r6, [%[data], #24]\n\t" + "ldr r7, [%[data], #28]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" + "str r5, [sp, #16]\n\t" + "str r4, [sp, #20]\n\t" + "str r7, [sp, #24]\n\t" + "str r6, [sp, #28]\n\t" + "ldr r4, [%[data], #32]\n\t" + "ldr r5, [%[data], #36]\n\t" + "ldr r6, [%[data], #40]\n\t" + "ldr r7, [%[data], #44]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" + "str r5, [sp, #32]\n\t" + "str r4, [sp, #36]\n\t" + "str r7, [sp, #40]\n\t" + "str r6, [sp, #44]\n\t" + "ldr r4, [%[data], #48]\n\t" + "ldr r5, [%[data], #52]\n\t" + "ldr r6, [%[data], #56]\n\t" + "ldr r7, [%[data], #60]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + 
"eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" + "str r5, [sp, #48]\n\t" + "str r4, [sp, #52]\n\t" + "str r7, [sp, #56]\n\t" + "str r6, [sp, #60]\n\t" + "ldr r4, [%[data], #64]\n\t" + "ldr r5, [%[data], #68]\n\t" + "ldr r6, [%[data], #72]\n\t" + "ldr r7, [%[data], #76]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" + "str r5, [sp, #64]\n\t" + "str r4, [sp, #68]\n\t" + "str r7, [sp, #72]\n\t" + "str r6, [sp, #76]\n\t" + "ldr r4, [%[data], #80]\n\t" + "ldr r5, [%[data], #84]\n\t" + "ldr r6, [%[data], #88]\n\t" + "ldr r7, [%[data], #92]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" + "str r5, [sp, #80]\n\t" + "str r4, [sp, #84]\n\t" + "str r7, [sp, #88]\n\t" + "str r6, [sp, #92]\n\t" + "ldr r4, [%[data], #96]\n\t" + "ldr r5, [%[data], #100]\n\t" + "ldr r6, [%[data], #104]\n\t" + "ldr r7, [%[data], #108]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" + "str r5, [sp, #96]\n\t" + "str r4, [sp, #100]\n\t" + "str r7, [sp, #104]\n\t" + "str r6, [sp, #108]\n\t" + "ldr r4, [%[data], #112]\n\t" + "ldr r5, [%[data], #116]\n\t" + "ldr r6, [%[data], #120]\n\t" + "ldr r7, [%[data], #124]\n\t" + "eor r8, r4, r4, ror #16\n\t" + "eor r9, r5, r5, ror #16\n\t" + "eor r10, r6, r6, ror #16\n\t" + "eor r11, r7, r7, ror #16\n\t" + "bic r8, r8, #0xff0000\n\t" + "bic r9, r9, #0xff0000\n\t" + "bic r10, r10, #0xff0000\n\t" + "bic r11, r11, #0xff0000\n\t" + "ror r4, r4, #8\n\t" + "ror r5, r5, #8\n\t" + "ror r6, r6, #8\n\t" + "ror r7, r7, #8\n\t" + "eor r4, r4, r8, lsr #8\n\t" + "eor r5, r5, r9, lsr #8\n\t" + "eor r6, r6, r10, lsr #8\n\t" + "eor r7, r7, r11, lsr #8\n\t" + "str r5, [sp, #112]\n\t" + "str r4, [sp, #116]\n\t" + "str r7, [sp, #120]\n\t" + "str r6, [sp, #124]\n\t" +#else "ldr r4, [%[data]]\n\t" "ldr r5, [%[data], #4]\n\t" "ldr r6, [%[data], #8]\n\t" @@ -293,14 +499,15 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "str r8, [sp, #116]\n\t" "str r11, [sp, 
#120]\n\t" "str r10, [sp, #124]\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ /* Pre-calc: b ^ c */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [%[sha512], #8]\n\t" "ldr r11, [%[sha512], #12]\n\t" #else "ldrd r10, r11, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -313,7 +520,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "\n" "L_SHA512_transform_len_start_%=: \n\t" /* Round 0 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -333,7 +540,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -343,25 +550,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else "strd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else "ldrd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else @@ -373,13 +580,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp]\n\t" "ldr r9, [sp, #4]\n\t" #else @@ -387,7 +594,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3]\n\t" "ldr r7, [r3, #4]\n\t" #else @@ -395,7 +602,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else @@ -403,7 +610,7 @@ void 
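Note on the new WOLFSSL_ARM_ARCH < 6 path added above: cores older than ARMv6 lack the REV instruction, so each 32-bit word of the 128-byte message block is byte-reversed with the classic eor/bic/ror sequence, and the two halves of every 64-bit word are stored swapped (r5 before r4, r7 before r6) so the little-endian register pairs end up holding the big-endian SHA-512 message words. Below is a minimal C sketch of the 32-bit swap under a hypothetical helper name; it is an illustration of the technique only, not part of the patch.

    #include <stdint.h>

    /* Illustration only (hypothetical helper, not in the patch): byte-reverse a
     * 32-bit word without the ARMv6 REV instruction, mirroring the
     * eor/bic/ror/eor sequence emitted in the WOLFSSL_ARM_ARCH < 6 path. */
    static inline uint32_t swap32_no_rev(uint32_t x)
    {
        uint32_t t = x ^ ((x >> 16) | (x << 16)); /* eor t, x, x, ror #16 */
        t &= ~0x00ff0000u;                        /* bic t, t, #0xff0000  */
        x = (x >> 8) | (x << 24);                 /* ror x, x, #8         */
        return x ^ (t >> 8);                      /* eor x, x, t, lsr #8  */
    }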
Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else @@ -411,13 +618,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #24]\n\t" "str r9, [%[sha512], #28]\n\t" #else @@ -437,7 +644,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -447,19 +654,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else "ldrd r8, r9, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else "ldrd r6, r7, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else @@ -471,7 +678,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else @@ -479,7 +686,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #56]\n\t" "str r7, [%[sha512], #60]\n\t" #else @@ -488,7 +695,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[0] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #112]\n\t" "ldr r5, [sp, #116]\n\t" #else @@ -509,13 +716,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp]\n\t" "ldr r5, [sp, #4]\n\t" #else "ldrd r4, r5, [sp]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #72]\n\t" "ldr r9, [sp, #76]\n\t" #else @@ -525,13 +732,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp]\n\t" "str r5, [sp, #4]\n\t" #else "strd r4, r5, [sp]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #8]\n\t" "ldr r5, [sp, #12]\n\t" #else @@ -552,7 +759,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp]\n\t" "ldr r5, [sp, #4]\n\t" #else @@ -560,14 +767,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp]\n\t" "str r5, [sp, #4]\n\t" #else "strd r4, r5, [sp]\n\t" #endif /* Round 1 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -587,7 +794,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -597,25 +804,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else "strd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else "ldrd r6, r7, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else @@ -627,13 +834,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #8]\n\t" "ldr r9, [sp, #12]\n\t" #else @@ -641,7 +848,7 @@ void 
Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #8]\n\t" "ldr r7, [r3, #12]\n\t" #else @@ -649,7 +856,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else @@ -657,7 +864,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else @@ -665,13 +872,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #16]\n\t" "str r9, [%[sha512], #20]\n\t" #else @@ -691,7 +898,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -701,19 +908,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else "ldrd r8, r9, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else "ldrd r6, r7, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else @@ -725,7 +932,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else @@ -733,7 +940,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #48]\n\t" "str r7, [%[sha512], #52]\n\t" #else @@ -742,7 +949,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, 
r9\n\t" /* Calc new W[1] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #120]\n\t" "ldr r5, [sp, #124]\n\t" #else @@ -763,13 +970,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #8]\n\t" "ldr r5, [sp, #12]\n\t" #else "ldrd r4, r5, [sp, #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #80]\n\t" "ldr r9, [sp, #84]\n\t" #else @@ -779,13 +986,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #8]\n\t" "str r5, [sp, #12]\n\t" #else "strd r4, r5, [sp, #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #16]\n\t" "ldr r5, [sp, #20]\n\t" #else @@ -806,7 +1013,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #8]\n\t" "ldr r5, [sp, #12]\n\t" #else @@ -814,14 +1021,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #8]\n\t" "str r5, [sp, #12]\n\t" #else "strd r4, r5, [sp, #8]\n\t" #endif /* Round 2 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -841,7 +1048,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -851,25 +1058,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else "strd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else "ldrd r6, r7, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else @@ -881,13 +1088,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #16]\n\t" "ldr r9, [sp, #20]\n\t" #else @@ -895,7 +1102,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #16]\n\t" "ldr r7, [r3, #20]\n\t" #else @@ -903,7 +1110,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else @@ -911,7 +1118,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else @@ -919,13 +1126,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #8]\n\t" "str r9, [%[sha512], #12]\n\t" #else @@ -945,7 +1152,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -955,19 +1162,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else "ldrd r8, r9, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else "ldrd r6, r7, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else @@ -979,7 +1186,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" 
"eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else @@ -987,7 +1194,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #40]\n\t" "str r7, [%[sha512], #44]\n\t" #else @@ -996,7 +1203,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[2] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp]\n\t" "ldr r5, [sp, #4]\n\t" #else @@ -1017,13 +1224,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #16]\n\t" "ldr r5, [sp, #20]\n\t" #else "ldrd r4, r5, [sp, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #88]\n\t" "ldr r9, [sp, #92]\n\t" #else @@ -1033,13 +1240,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #16]\n\t" "str r5, [sp, #20]\n\t" #else "strd r4, r5, [sp, #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #24]\n\t" "ldr r5, [sp, #28]\n\t" #else @@ -1060,7 +1267,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #16]\n\t" "ldr r5, [sp, #20]\n\t" #else @@ -1068,14 +1275,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #16]\n\t" "str r5, [sp, #20]\n\t" #else "strd r4, r5, [sp, #16]\n\t" #endif /* Round 3 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -1095,7 +1302,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -1105,25 +1312,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else "strd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else "ldrd r6, r7, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else @@ -1135,13 +1342,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #24]\n\t" "ldr r9, [sp, #28]\n\t" #else @@ -1149,7 +1356,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #24]\n\t" "ldr r7, [r3, #28]\n\t" #else @@ -1157,7 +1364,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else @@ -1165,7 +1372,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else @@ -1173,13 +1380,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512]]\n\t" "str r9, [%[sha512], #4]\n\t" #else @@ -1199,7 +1406,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -1209,19 +1416,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH 
< 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else "ldrd r8, r9, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else "ldrd r6, r7, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else @@ -1233,7 +1440,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else @@ -1241,7 +1448,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #32]\n\t" "str r7, [%[sha512], #36]\n\t" #else @@ -1250,7 +1457,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[3] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #8]\n\t" "ldr r5, [sp, #12]\n\t" #else @@ -1271,13 +1478,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #24]\n\t" "ldr r5, [sp, #28]\n\t" #else "ldrd r4, r5, [sp, #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #96]\n\t" "ldr r9, [sp, #100]\n\t" #else @@ -1287,13 +1494,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #24]\n\t" "str r5, [sp, #28]\n\t" #else "strd r4, r5, [sp, #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #32]\n\t" "ldr r5, [sp, #36]\n\t" #else @@ -1314,7 +1521,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #24]\n\t" "ldr r5, [sp, #28]\n\t" #else @@ -1322,14 +1529,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #24]\n\t" "str r5, [sp, #28]\n\t" #else "strd r4, r5, [sp, #24]\n\t" #endif /* Round 4 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -1349,7 +1556,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -1359,25 +1566,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else "strd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else "ldrd r6, r7, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else @@ -1389,13 +1596,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #32]\n\t" "ldr r9, [sp, #36]\n\t" #else @@ -1403,7 +1610,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #32]\n\t" "ldr r7, [r3, #36]\n\t" #else @@ -1411,7 +1618,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else @@ -1419,7 +1626,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else @@ -1427,13 +1634,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) 
+#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #56]\n\t" "str r9, [%[sha512], #60]\n\t" #else @@ -1453,7 +1660,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -1463,19 +1670,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else "ldrd r8, r9, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else "ldrd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else @@ -1487,7 +1694,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else @@ -1495,7 +1702,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #24]\n\t" "str r7, [%[sha512], #28]\n\t" #else @@ -1504,7 +1711,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[4] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #16]\n\t" "ldr r5, [sp, #20]\n\t" #else @@ -1525,13 +1732,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #32]\n\t" "ldr r5, [sp, #36]\n\t" #else "ldrd r4, r5, [sp, #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #104]\n\t" "ldr r9, [sp, #108]\n\t" #else @@ -1541,13 +1748,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #32]\n\t" "str r5, [sp, #36]\n\t" #else "strd r4, r5, [sp, #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #40]\n\t" "ldr r5, [sp, #44]\n\t" #else @@ -1568,7 +1775,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* 
data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #32]\n\t" "ldr r5, [sp, #36]\n\t" #else @@ -1576,14 +1783,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #32]\n\t" "str r5, [sp, #36]\n\t" #else "strd r4, r5, [sp, #32]\n\t" #endif /* Round 5 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -1603,7 +1810,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -1613,25 +1820,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else "strd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else "ldrd r6, r7, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else @@ -1643,13 +1850,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #40]\n\t" "ldr r9, [sp, #44]\n\t" #else @@ -1657,7 +1864,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #40]\n\t" "ldr r7, [r3, #44]\n\t" #else @@ -1665,7 +1872,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else @@ -1673,7 +1880,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, 
const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else @@ -1681,13 +1888,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #48]\n\t" "str r9, [%[sha512], #52]\n\t" #else @@ -1707,7 +1914,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -1717,19 +1924,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else "ldrd r8, r9, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else "ldrd r6, r7, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else @@ -1741,7 +1948,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else @@ -1749,7 +1956,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #16]\n\t" "str r7, [%[sha512], #20]\n\t" #else @@ -1758,7 +1965,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[5] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #24]\n\t" "ldr r5, [sp, #28]\n\t" #else @@ -1779,13 +1986,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #40]\n\t" "ldr r5, [sp, #44]\n\t" #else "ldrd r4, r5, [sp, #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) 
&& (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #112]\n\t" "ldr r9, [sp, #116]\n\t" #else @@ -1795,13 +2002,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #40]\n\t" "str r5, [sp, #44]\n\t" #else "strd r4, r5, [sp, #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #48]\n\t" "ldr r5, [sp, #52]\n\t" #else @@ -1822,7 +2029,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #40]\n\t" "ldr r5, [sp, #44]\n\t" #else @@ -1830,14 +2037,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #40]\n\t" "str r5, [sp, #44]\n\t" #else "strd r4, r5, [sp, #40]\n\t" #endif /* Round 6 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -1857,7 +2064,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -1867,25 +2074,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else "strd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else "ldrd r6, r7, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else @@ -1897,13 +2104,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #48]\n\t" "ldr r9, [sp, #52]\n\t" #else @@ -1911,7 +2118,7 
@@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #48]\n\t" "ldr r7, [r3, #52]\n\t" #else @@ -1919,7 +2126,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else @@ -1927,7 +2134,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else @@ -1935,13 +2142,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #40]\n\t" "str r9, [%[sha512], #44]\n\t" #else @@ -1961,7 +2168,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -1971,19 +2178,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else "ldrd r8, r9, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else "ldrd r6, r7, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else @@ -1995,7 +2202,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else @@ -2003,7 +2210,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #8]\n\t" "str r7, [%[sha512], #12]\n\t" #else @@ -2012,7 +2219,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 
len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[6] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #32]\n\t" "ldr r5, [sp, #36]\n\t" #else @@ -2033,13 +2240,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #48]\n\t" "ldr r5, [sp, #52]\n\t" #else "ldrd r4, r5, [sp, #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #120]\n\t" "ldr r9, [sp, #124]\n\t" #else @@ -2049,13 +2256,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #48]\n\t" "str r5, [sp, #52]\n\t" #else "strd r4, r5, [sp, #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #56]\n\t" "ldr r5, [sp, #60]\n\t" #else @@ -2076,7 +2283,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #48]\n\t" "ldr r5, [sp, #52]\n\t" #else @@ -2084,14 +2291,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #48]\n\t" "str r5, [sp, #52]\n\t" #else "strd r4, r5, [sp, #48]\n\t" #endif /* Round 7 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -2111,7 +2318,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -2121,25 +2328,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else "strd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else "ldrd r6, r7, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else @@ -2151,13 +2358,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #56]\n\t" "ldr r9, [sp, #60]\n\t" #else @@ -2165,7 +2372,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #56]\n\t" "ldr r7, [r3, #60]\n\t" #else @@ -2173,7 +2380,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else @@ -2181,7 +2388,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else @@ -2189,13 +2396,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #32]\n\t" "str r9, [%[sha512], #36]\n\t" #else @@ -2215,7 +2422,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -2225,19 +2432,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else "ldrd r8, r9, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else "ldrd r6, r7, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else @@ -2249,7 +2456,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, 
r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else @@ -2257,7 +2464,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512]]\n\t" "str r7, [%[sha512], #4]\n\t" #else @@ -2266,7 +2473,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[7] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #40]\n\t" "ldr r5, [sp, #44]\n\t" #else @@ -2287,13 +2494,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #56]\n\t" "ldr r5, [sp, #60]\n\t" #else "ldrd r4, r5, [sp, #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp]\n\t" "ldr r9, [sp, #4]\n\t" #else @@ -2303,13 +2510,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #56]\n\t" "str r5, [sp, #60]\n\t" #else "strd r4, r5, [sp, #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #64]\n\t" "ldr r5, [sp, #68]\n\t" #else @@ -2330,7 +2537,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #56]\n\t" "ldr r5, [sp, #60]\n\t" #else @@ -2338,14 +2545,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #56]\n\t" "str r5, [sp, #60]\n\t" #else "strd r4, r5, [sp, #56]\n\t" #endif /* Round 8 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -2365,7 +2572,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -2375,25 +2582,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else "strd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else "ldrd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else @@ -2405,13 +2612,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #64]\n\t" "ldr r9, [sp, #68]\n\t" #else @@ -2419,7 +2626,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #64]\n\t" "ldr r7, [r3, #68]\n\t" #else @@ -2427,7 +2634,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else @@ -2435,7 +2642,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else @@ -2443,13 +2650,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #24]\n\t" "str r9, [%[sha512], #28]\n\t" #else @@ -2469,7 +2676,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -2479,19 +2686,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else "ldrd r8, r9, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else "ldrd r6, r7, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else @@ -2503,7 +2710,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else @@ -2511,7 +2718,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #56]\n\t" "str r7, [%[sha512], #60]\n\t" #else @@ -2520,7 +2727,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[8] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #48]\n\t" "ldr r5, [sp, #52]\n\t" #else @@ -2541,13 +2748,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #64]\n\t" "ldr r5, [sp, #68]\n\t" #else "ldrd r4, r5, [sp, #64]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #8]\n\t" "ldr r9, [sp, #12]\n\t" #else @@ -2557,13 +2764,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #64]\n\t" "str r5, [sp, #68]\n\t" #else "strd r4, r5, [sp, #64]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #72]\n\t" "ldr r5, [sp, #76]\n\t" #else @@ -2584,7 +2791,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #64]\n\t" "ldr r5, [sp, #68]\n\t" #else @@ -2592,14 +2799,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #64]\n\t" "str r5, [sp, #68]\n\t" #else "strd r4, r5, [sp, #64]\n\t" #endif /* Round 9 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -2619,7 +2826,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -2629,25 +2836,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else "strd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else "ldrd r6, r7, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else @@ -2659,13 +2866,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #72]\n\t" "ldr r9, [sp, #76]\n\t" #else @@ -2673,7 +2880,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #72]\n\t" "ldr r7, [r3, #76]\n\t" #else @@ -2681,7 +2888,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else @@ -2689,7 +2896,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else @@ -2697,13 +2904,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #16]\n\t" "str r9, [%[sha512], #20]\n\t" #else @@ -2723,7 +2930,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -2733,19 +2940,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else "ldrd r8, r9, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else "ldrd r6, r7, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else @@ -2757,7 +2964,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else @@ -2765,7 +2972,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #48]\n\t" "str r7, [%[sha512], #52]\n\t" #else @@ -2774,7 +2981,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[9] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #56]\n\t" "ldr r5, [sp, #60]\n\t" #else @@ -2795,13 +3002,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #72]\n\t" "ldr r5, [sp, #76]\n\t" #else "ldrd r4, r5, [sp, #72]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #16]\n\t" "ldr r9, [sp, #20]\n\t" #else @@ -2811,13 +3018,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #72]\n\t" "str r5, [sp, #76]\n\t" #else "strd r4, r5, [sp, #72]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #80]\n\t" "ldr r5, [sp, #84]\n\t" #else @@ -2838,7 +3045,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, 
const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #72]\n\t" "ldr r5, [sp, #76]\n\t" #else @@ -2846,14 +3053,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #72]\n\t" "str r5, [sp, #76]\n\t" #else "strd r4, r5, [sp, #72]\n\t" #endif /* Round 10 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -2873,7 +3080,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -2883,25 +3090,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else "strd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else "ldrd r6, r7, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else @@ -2913,13 +3120,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #80]\n\t" "ldr r9, [sp, #84]\n\t" #else @@ -2927,7 +3134,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #80]\n\t" "ldr r7, [r3, #84]\n\t" #else @@ -2935,7 +3142,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else @@ -2943,7 +3150,7 @@ void 
Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else @@ -2951,13 +3158,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #8]\n\t" "str r9, [%[sha512], #12]\n\t" #else @@ -2977,7 +3184,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -2987,19 +3194,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else "ldrd r8, r9, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else "ldrd r6, r7, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else @@ -3011,7 +3218,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else @@ -3019,7 +3226,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #40]\n\t" "str r7, [%[sha512], #44]\n\t" #else @@ -3028,7 +3235,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[10] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #64]\n\t" "ldr r5, [sp, #68]\n\t" #else @@ -3049,13 +3256,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #80]\n\t" "ldr r5, [sp, #84]\n\t" #else "ldrd r4, r5, [sp, #80]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #24]\n\t" "ldr r9, [sp, #28]\n\t" #else @@ -3065,13 +3272,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #80]\n\t" "str r5, [sp, #84]\n\t" #else "strd r4, r5, [sp, #80]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #88]\n\t" "ldr r5, [sp, #92]\n\t" #else @@ -3092,7 +3299,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #80]\n\t" "ldr r5, [sp, #84]\n\t" #else @@ -3100,14 +3307,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #80]\n\t" "str r5, [sp, #84]\n\t" #else "strd r4, r5, [sp, #80]\n\t" #endif /* Round 11 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -3127,7 +3334,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -3137,25 +3344,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else "strd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else "ldrd r6, r7, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else @@ -3167,13 +3374,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, 
[sp, #88]\n\t" "ldr r9, [sp, #92]\n\t" #else @@ -3181,7 +3388,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #88]\n\t" "ldr r7, [r3, #92]\n\t" #else @@ -3189,7 +3396,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else @@ -3197,7 +3404,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else @@ -3205,13 +3412,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512]]\n\t" "str r9, [%[sha512], #4]\n\t" #else @@ -3231,7 +3438,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -3241,19 +3448,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else "ldrd r8, r9, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else "ldrd r6, r7, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else @@ -3265,7 +3472,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else @@ -3273,7 +3480,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #32]\n\t" "str r7, [%[sha512], #36]\n\t" #else @@ -3282,7 +3489,7 @@ void 
Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[11] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #72]\n\t" "ldr r5, [sp, #76]\n\t" #else @@ -3303,13 +3510,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #88]\n\t" "ldr r5, [sp, #92]\n\t" #else "ldrd r4, r5, [sp, #88]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #32]\n\t" "ldr r9, [sp, #36]\n\t" #else @@ -3319,13 +3526,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #88]\n\t" "str r5, [sp, #92]\n\t" #else "strd r4, r5, [sp, #88]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #96]\n\t" "ldr r5, [sp, #100]\n\t" #else @@ -3346,7 +3553,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #88]\n\t" "ldr r5, [sp, #92]\n\t" #else @@ -3354,14 +3561,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #88]\n\t" "str r5, [sp, #92]\n\t" #else "strd r4, r5, [sp, #88]\n\t" #endif /* Round 12 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -3381,7 +3588,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -3391,25 +3598,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else "strd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else "ldrd r6, r7, [%[sha512], #8]\n\t" #endif 
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else @@ -3421,13 +3628,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #96]\n\t" "ldr r9, [sp, #100]\n\t" #else @@ -3435,7 +3642,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #96]\n\t" "ldr r7, [r3, #100]\n\t" #else @@ -3443,7 +3650,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else @@ -3451,7 +3658,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else @@ -3459,13 +3666,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #56]\n\t" "str r9, [%[sha512], #60]\n\t" #else @@ -3485,7 +3692,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -3495,19 +3702,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else "ldrd r8, r9, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else "ldrd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else @@ -3519,7 
+3726,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else @@ -3527,7 +3734,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #24]\n\t" "str r7, [%[sha512], #28]\n\t" #else @@ -3536,7 +3743,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[12] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #80]\n\t" "ldr r5, [sp, #84]\n\t" #else @@ -3557,13 +3764,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #96]\n\t" "ldr r5, [sp, #100]\n\t" #else "ldrd r4, r5, [sp, #96]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #40]\n\t" "ldr r9, [sp, #44]\n\t" #else @@ -3573,13 +3780,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #96]\n\t" "str r5, [sp, #100]\n\t" #else "strd r4, r5, [sp, #96]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #104]\n\t" "ldr r5, [sp, #108]\n\t" #else @@ -3600,7 +3807,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #96]\n\t" "ldr r5, [sp, #100]\n\t" #else @@ -3608,14 +3815,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #96]\n\t" "str r5, [sp, #100]\n\t" #else "strd r4, r5, [sp, #96]\n\t" #endif /* Round 13 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -3635,7 +3842,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -3645,25 +3852,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, 
r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else "strd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else "ldrd r6, r7, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else @@ -3675,13 +3882,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #104]\n\t" "ldr r9, [sp, #108]\n\t" #else @@ -3689,7 +3896,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #104]\n\t" "ldr r7, [r3, #108]\n\t" #else @@ -3697,7 +3904,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else @@ -3705,7 +3912,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else @@ -3713,13 +3920,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #48]\n\t" "str r9, [%[sha512], #52]\n\t" #else @@ -3739,7 +3946,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -3749,19 +3956,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 
len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else "ldrd r8, r9, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else "ldrd r6, r7, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else @@ -3773,7 +3980,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else @@ -3781,7 +3988,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #16]\n\t" "str r7, [%[sha512], #20]\n\t" #else @@ -3790,7 +3997,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[13] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #88]\n\t" "ldr r5, [sp, #92]\n\t" #else @@ -3811,13 +4018,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #104]\n\t" "ldr r5, [sp, #108]\n\t" #else "ldrd r4, r5, [sp, #104]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #48]\n\t" "ldr r9, [sp, #52]\n\t" #else @@ -3827,13 +4034,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #104]\n\t" "str r5, [sp, #108]\n\t" #else "strd r4, r5, [sp, #104]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #112]\n\t" "ldr r5, [sp, #116]\n\t" #else @@ -3854,7 +4061,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #104]\n\t" "ldr r5, [sp, #108]\n\t" #else @@ -3862,14 +4069,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #104]\n\t" "str r5, [sp, 
#108]\n\t" #else "strd r4, r5, [sp, #104]\n\t" #endif /* Round 14 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -3889,7 +4096,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -3899,25 +4106,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else "strd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else "ldrd r6, r7, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else @@ -3929,13 +4136,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #112]\n\t" "ldr r9, [sp, #116]\n\t" #else @@ -3943,7 +4150,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #112]\n\t" "ldr r7, [r3, #116]\n\t" #else @@ -3951,7 +4158,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else @@ -3959,7 +4166,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else @@ -3967,13 +4174,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], 
#16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #40]\n\t" "str r9, [%[sha512], #44]\n\t" #else @@ -3993,7 +4200,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -4003,19 +4210,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else "ldrd r8, r9, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else "ldrd r6, r7, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else @@ -4027,7 +4234,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else @@ -4035,7 +4242,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #8]\n\t" "str r7, [%[sha512], #12]\n\t" #else @@ -4044,7 +4251,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[14] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #96]\n\t" "ldr r5, [sp, #100]\n\t" #else @@ -4065,13 +4272,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #112]\n\t" "ldr r5, [sp, #116]\n\t" #else "ldrd r4, r5, [sp, #112]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #56]\n\t" "ldr r9, [sp, #60]\n\t" #else @@ -4081,13 +4288,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #112]\n\t" "str r5, [sp, #116]\n\t" #else "strd r4, r5, [sp, #112]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #120]\n\t" "ldr r5, [sp, #124]\n\t" #else @@ -4108,7 +4315,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #112]\n\t" "ldr r5, [sp, #116]\n\t" #else @@ -4116,14 +4323,14 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #112]\n\t" "str r5, [sp, #116]\n\t" #else "strd r4, r5, [sp, #112]\n\t" #endif /* Round 15 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -4143,7 +4350,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -4153,25 +4360,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else "strd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else "ldrd r6, r7, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else @@ -4183,13 +4390,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #120]\n\t" "ldr r9, [sp, #124]\n\t" #else @@ -4197,7 +4404,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #120]\n\t" "ldr r7, [r3, #124]\n\t" #else @@ -4205,7 +4412,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else @@ -4213,7 +4420,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else @@ -4221,13 +4428,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #32]\n\t" "str r9, [%[sha512], #36]\n\t" #else @@ -4247,7 +4454,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -4257,19 +4464,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else "ldrd r8, r9, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else "ldrd r6, r7, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else @@ -4281,7 +4488,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else @@ -4289,7 +4496,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512]]\n\t" "str r7, [%[sha512], #4]\n\t" #else @@ -4298,7 +4505,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Calc new W[15] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #104]\n\t" "ldr r5, [sp, #108]\n\t" #else @@ -4319,13 +4526,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #26\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #120]\n\t" "ldr r5, [sp, #124]\n\t" #else 
"ldrd r4, r5, [sp, #120]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #64]\n\t" "ldr r9, [sp, #68]\n\t" #else @@ -4335,13 +4542,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r7\n\t" "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #120]\n\t" "str r5, [sp, #124]\n\t" #else "strd r4, r5, [sp, #120]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp]\n\t" "ldr r5, [sp, #4]\n\t" #else @@ -4362,7 +4569,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "orr r8, r8, r5, lsl #25\n\t" "eor r7, r7, r9\n\t" "eor r6, r6, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [sp, #120]\n\t" "ldr r5, [sp, #124]\n\t" #else @@ -4370,7 +4577,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #120]\n\t" "str r5, [sp, #124]\n\t" #else @@ -4380,7 +4587,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "subs r12, r12, #1\n\t" "bne L_SHA512_transform_len_start_%=\n\t" /* Round 0 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -4400,7 +4607,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -4410,25 +4617,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else "strd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else "ldrd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else @@ -4440,13 +4647,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" 
"ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp]\n\t" "ldr r9, [sp, #4]\n\t" #else @@ -4454,7 +4661,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3]\n\t" "ldr r7, [r3, #4]\n\t" #else @@ -4462,7 +4669,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else @@ -4470,7 +4677,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else @@ -4478,13 +4685,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #24]\n\t" "str r9, [%[sha512], #28]\n\t" #else @@ -4504,7 +4711,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -4514,19 +4721,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else "ldrd r8, r9, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else "ldrd r6, r7, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else @@ -4538,7 +4745,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else @@ -4546,7 +4753,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #56]\n\t" "str r7, [%[sha512], #60]\n\t" #else @@ -4555,7 +4762,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 1 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -4575,7 +4782,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -4585,25 +4792,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else "strd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else "ldrd r6, r7, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else @@ -4615,13 +4822,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #8]\n\t" "ldr r9, [sp, #12]\n\t" #else @@ -4629,7 +4836,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #8]\n\t" "ldr r7, [r3, #12]\n\t" #else @@ -4637,7 +4844,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else @@ -4645,7 +4852,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else @@ -4653,13 +4860,13 @@ void Transform_Sha512_Len(wc_Sha512* 
sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #16]\n\t" "str r9, [%[sha512], #20]\n\t" #else @@ -4679,7 +4886,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -4689,19 +4896,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else "ldrd r8, r9, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else "ldrd r6, r7, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else @@ -4713,7 +4920,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else @@ -4721,7 +4928,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #48]\n\t" "str r7, [%[sha512], #52]\n\t" #else @@ -4730,7 +4937,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 2 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -4750,7 +4957,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -4760,25 +4967,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else "strd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else "ldrd r6, r7, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else @@ -4790,13 +4997,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #16]\n\t" "ldr r9, [sp, #20]\n\t" #else @@ -4804,7 +5011,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #16]\n\t" "ldr r7, [r3, #20]\n\t" #else @@ -4812,7 +5019,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else @@ -4820,7 +5027,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else @@ -4828,13 +5035,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #8]\n\t" "str r9, [%[sha512], #12]\n\t" #else @@ -4854,7 +5061,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -4864,19 +5071,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else "ldrd r8, r9, [%[sha512], #48]\n\t" #endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else "ldrd r6, r7, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else @@ -4888,7 +5095,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else @@ -4896,7 +5103,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #40]\n\t" "str r7, [%[sha512], #44]\n\t" #else @@ -4905,7 +5112,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 3 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -4925,7 +5132,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -4935,25 +5142,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else "strd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else "ldrd r6, r7, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else @@ -4965,13 +5172,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #24]\n\t" "ldr r9, [sp, #28]\n\t" #else @@ -4979,7 +5186,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) 
#endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #24]\n\t" "ldr r7, [r3, #28]\n\t" #else @@ -4987,7 +5194,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else @@ -4995,7 +5202,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else @@ -5003,13 +5210,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512]]\n\t" "str r9, [%[sha512], #4]\n\t" #else @@ -5029,7 +5236,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -5039,19 +5246,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else "ldrd r8, r9, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else "ldrd r6, r7, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else @@ -5063,7 +5270,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else @@ -5071,7 +5278,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #32]\n\t" "str r7, [%[sha512], #36]\n\t" #else @@ -5080,7 +5287,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 4 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -5100,7 +5307,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -5110,25 +5317,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else "strd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else "ldrd r6, r7, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else @@ -5140,13 +5347,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #32]\n\t" "ldr r9, [sp, #36]\n\t" #else @@ -5154,7 +5361,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #32]\n\t" "ldr r7, [r3, #36]\n\t" #else @@ -5162,7 +5369,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else @@ -5170,7 +5377,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else @@ -5178,13 +5385,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #56]\n\t" "str r9, [%[sha512], #60]\n\t" #else @@ -5204,7 +5411,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -5214,19 +5421,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else "ldrd r8, r9, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else "ldrd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else @@ -5238,7 +5445,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else @@ -5246,7 +5453,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #24]\n\t" "str r7, [%[sha512], #28]\n\t" #else @@ -5255,7 +5462,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 5 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -5275,7 +5482,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -5285,25 +5492,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else "strd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, 
[%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else "ldrd r6, r7, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else @@ -5315,13 +5522,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #40]\n\t" "ldr r9, [sp, #44]\n\t" #else @@ -5329,7 +5536,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #40]\n\t" "ldr r7, [r3, #44]\n\t" #else @@ -5337,7 +5544,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else @@ -5345,7 +5552,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else @@ -5353,13 +5560,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #48]\n\t" "str r9, [%[sha512], #52]\n\t" #else @@ -5379,7 +5586,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -5389,19 +5596,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else "ldrd r8, r9, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else "ldrd r6, r7, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 
7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else @@ -5413,7 +5620,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else @@ -5421,7 +5628,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #16]\n\t" "str r7, [%[sha512], #20]\n\t" #else @@ -5430,7 +5637,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 6 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -5450,7 +5657,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -5460,25 +5667,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else "strd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else "ldrd r6, r7, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else @@ -5490,13 +5697,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #48]\n\t" "ldr r9, [sp, #52]\n\t" #else @@ -5504,7 +5711,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #48]\n\t" "ldr r7, [r3, #52]\n\t" #else @@ -5512,7 +5719,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, 
r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else @@ -5520,7 +5727,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else @@ -5528,13 +5735,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #40]\n\t" "str r9, [%[sha512], #44]\n\t" #else @@ -5554,7 +5761,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -5564,19 +5771,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else "ldrd r8, r9, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else "ldrd r6, r7, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else @@ -5588,7 +5795,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else @@ -5596,7 +5803,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #8]\n\t" "str r7, [%[sha512], #12]\n\t" #else @@ -5605,7 +5812,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 7 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -5625,7 +5832,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -5635,25 +5842,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else "strd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else "ldrd r6, r7, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else @@ -5665,13 +5872,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #56]\n\t" "ldr r9, [sp, #60]\n\t" #else @@ -5679,7 +5886,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #56]\n\t" "ldr r7, [r3, #60]\n\t" #else @@ -5687,7 +5894,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else @@ -5695,7 +5902,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else @@ -5703,13 +5910,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #32]\n\t" "str r9, [%[sha512], #36]\n\t" #else @@ -5729,7 +5936,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -5739,19 +5946,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else "ldrd r8, r9, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else "ldrd r6, r7, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else @@ -5763,7 +5970,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else @@ -5771,7 +5978,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512]]\n\t" "str r7, [%[sha512], #4]\n\t" #else @@ -5780,7 +5987,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 8 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -5800,7 +6007,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -5810,25 +6017,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else "strd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else "ldrd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else @@ -5840,13 +6047,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and 
r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #64]\n\t" "ldr r9, [sp, #68]\n\t" #else @@ -5854,7 +6061,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #64]\n\t" "ldr r7, [r3, #68]\n\t" #else @@ -5862,7 +6069,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else @@ -5870,7 +6077,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else @@ -5878,13 +6085,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #24]\n\t" "str r9, [%[sha512], #28]\n\t" #else @@ -5904,7 +6111,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -5914,19 +6121,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else "ldrd r8, r9, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else "ldrd r6, r7, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #56]\n\t" "str r5, [%[sha512], #60]\n\t" #else @@ -5938,7 +6145,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, 
[%[sha512], #60]\n\t" #else @@ -5946,7 +6153,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #56]\n\t" "str r7, [%[sha512], #60]\n\t" #else @@ -5955,7 +6162,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 9 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -5975,7 +6182,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -5985,25 +6192,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else "strd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else "ldrd r6, r7, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else @@ -6015,13 +6222,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #72]\n\t" "ldr r9, [sp, #76]\n\t" #else @@ -6029,7 +6236,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #72]\n\t" "ldr r7, [r3, #76]\n\t" #else @@ -6037,7 +6244,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else @@ -6045,7 +6252,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else @@ -6053,13 +6260,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #16]\n\t" "str r9, [%[sha512], #20]\n\t" #else @@ -6079,7 +6286,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -6089,19 +6296,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else "ldrd r8, r9, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else "ldrd r6, r7, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else @@ -6113,7 +6320,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else @@ -6121,7 +6328,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #48]\n\t" "str r7, [%[sha512], #52]\n\t" #else @@ -6130,7 +6337,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 10 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -6150,7 +6357,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -6160,25 +6367,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else "strd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else "ldrd r6, r7, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else @@ -6190,13 +6397,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #80]\n\t" "ldr r9, [sp, #84]\n\t" #else @@ -6204,7 +6411,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #80]\n\t" "ldr r7, [r3, #84]\n\t" #else @@ -6212,7 +6419,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else @@ -6220,7 +6427,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else @@ -6228,13 +6435,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #8]\n\t" "str r9, [%[sha512], #12]\n\t" #else @@ -6254,7 +6461,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -6264,19 +6471,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc 
r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else "ldrd r8, r9, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else "ldrd r6, r7, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #40]\n\t" "str r5, [%[sha512], #44]\n\t" #else @@ -6288,7 +6495,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else @@ -6296,7 +6503,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #40]\n\t" "str r7, [%[sha512], #44]\n\t" #else @@ -6305,7 +6512,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 11 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -6325,7 +6532,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -6335,25 +6542,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else "strd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else "ldrd r6, r7, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else @@ -6365,13 +6572,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #88]\n\t" "ldr r9, [sp, #92]\n\t" #else @@ -6379,7 +6586,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #88]\n\t" "ldr r7, [r3, #92]\n\t" #else @@ -6387,7 +6594,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else @@ -6395,7 +6602,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else @@ -6403,13 +6610,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512]]\n\t" "str r9, [%[sha512], #4]\n\t" #else @@ -6429,7 +6636,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else @@ -6439,19 +6646,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else "ldrd r8, r9, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else "ldrd r6, r7, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else @@ -6463,7 +6670,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else @@ -6471,7 +6678,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #32]\n\t" "str r7, [%[sha512], #36]\n\t" #else @@ -6480,7 +6687,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 12 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -6500,7 +6707,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -6510,25 +6717,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else "strd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else "ldrd r6, r7, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else @@ -6540,13 +6747,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #96]\n\t" "ldr r9, [sp, #100]\n\t" #else @@ -6554,7 +6761,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #96]\n\t" "ldr r7, [r3, #100]\n\t" #else @@ -6562,7 +6769,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else @@ -6570,7 +6777,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else @@ -6578,13 +6785,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" 
"adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #56]\n\t" "str r9, [%[sha512], #60]\n\t" #else @@ -6604,7 +6811,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else @@ -6614,19 +6821,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else "ldrd r8, r9, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else "ldrd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #24]\n\t" "str r5, [%[sha512], #28]\n\t" #else @@ -6638,7 +6845,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else @@ -6646,7 +6853,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #24]\n\t" "str r7, [%[sha512], #28]\n\t" #else @@ -6655,7 +6862,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 13 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else @@ -6675,7 +6882,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -6685,25 +6892,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else "strd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #56]\n\t" "ldr r5, [%[sha512], #60]\n\t" #else "ldrd r4, r5, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else "ldrd r6, r7, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else @@ -6715,13 +6922,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #104]\n\t" "ldr r9, [sp, #108]\n\t" #else @@ -6729,7 +6936,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #104]\n\t" "ldr r7, [r3, #108]\n\t" #else @@ -6737,7 +6944,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #48]\n\t" "ldr r9, [%[sha512], #52]\n\t" #else @@ -6745,7 +6952,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else @@ -6753,13 +6960,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #24]\n\t" "ldr r5, [%[sha512], #28]\n\t" #else "ldrd r4, r5, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #48]\n\t" "str r9, [%[sha512], #52]\n\t" #else @@ -6779,7 +6986,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else @@ -6789,19 +6996,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #24]\n\t" "ldr r9, [%[sha512], #28]\n\t" #else "ldrd r8, r9, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #32]\n\t" "ldr r7, [%[sha512], #36]\n\t" #else "ldrd r6, r7, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else @@ -6813,7 +7020,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else @@ -6821,7 +7028,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #16]\n\t" "str r7, [%[sha512], #20]\n\t" #else @@ -6830,7 +7037,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 14 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else @@ -6850,7 +7057,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -6860,25 +7067,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else "strd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else "ldrd r6, r7, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512]]\n\t" "ldr r9, [%[sha512], #4]\n\t" #else @@ -6890,13 +7097,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #112]\n\t" "ldr r9, [sp, #116]\n\t" #else @@ -6904,7 +7111,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #112]\n\t" "ldr r7, [r3, #116]\n\t" #else @@ -6912,7 +7119,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #40]\n\t" "ldr r9, [%[sha512], #44]\n\t" #else @@ -6920,7 +7127,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else @@ -6928,13 +7135,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #40]\n\t" "str r9, [%[sha512], #44]\n\t" #else @@ -6954,7 +7161,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else @@ -6964,19 +7171,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #16]\n\t" "ldr r9, [%[sha512], #20]\n\t" #else "ldrd r8, r9, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else "ldrd r6, r7, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #8]\n\t" "str r5, [%[sha512], #12]\n\t" #else @@ -6988,7 +7195,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else @@ -6996,7 +7203,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #8]\n\t" "str r7, [%[sha512], #12]\n\t" #else @@ -7005,7 +7212,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Round 15 */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else @@ -7025,7 +7232,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #23\n\t" "orr r9, r9, r4, lsr #9\n\t" "orr r8, r8, r5, lsr #9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -7035,25 +7242,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else "strd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #40]\n\t" "ldr r5, [%[sha512], #44]\n\t" #else "ldrd r4, r5, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #48]\n\t" "ldr r7, [%[sha512], #52]\n\t" #else "ldrd r6, r7, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #56]\n\t" "ldr r9, [%[sha512], #60]\n\t" #else @@ -7065,13 +7272,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r7, r7, r5\n\t" "eor r6, r6, r8\n\t" "eor r7, r7, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #120]\n\t" "ldr r9, [sp, #124]\n\t" #else @@ -7079,7 +7286,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [r3, #120]\n\t" "ldr r7, [r3, #124]\n\t" #else @@ -7087,7 +7294,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r8\n\t" "adc r5, r5, r9\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #32]\n\t" "ldr r9, [%[sha512], #36]\n\t" #else @@ -7095,7 +7302,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else @@ -7103,13 +7310,13 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r8, r8, r4\n\t" "adc r9, r9, r5\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #8]\n\t" "ldr r5, [%[sha512], #12]\n\t" #else "ldrd r4, r5, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[sha512], #32]\n\t" "str r9, [%[sha512], #36]\n\t" #else @@ -7129,7 +7336,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "lsls r9, r5, #25\n\t" "orr r9, r9, r4, lsr #7\n\t" "orr r8, r8, r5, lsr #7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else @@ -7139,19 +7346,19 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "eor r7, r7, r9\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[sha512], #8]\n\t" "ldr r9, [%[sha512], #12]\n\t" #else "ldrd r8, r9, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #16]\n\t" "ldr r7, [%[sha512], #20]\n\t" #else "ldrd r6, r7, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else @@ -7163,7 +7370,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "and r11, r11, r9\n\t" "eor r10, r10, r6\n\t" "eor r11, r11, r7\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512]]\n\t" "ldr r7, [%[sha512], #4]\n\t" #else @@ -7171,7 +7378,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512]]\n\t" "str r7, [%[sha512], #4]\n\t" #else @@ -7180,25 +7387,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "mov r10, r8\n\t" "mov r11, r9\n\t" /* Add in digest from start */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512]]\n\t" "ldr r5, [%[sha512], #4]\n\t" #else "ldrd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #8]\n\t" "ldr r7, [%[sha512], #12]\n\t" #else "ldrd r6, r7, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #128]\n\t" "ldr r9, [sp, #132]\n\t" #else "ldrd r8, r9, [sp, #128]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [sp, #136]\n\t" "ldr r11, [sp, #140]\n\t" #else @@ -7208,49 +7415,49 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r9\n\t" "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512]]\n\t" "str r5, [%[sha512], #4]\n\t" #else "strd r4, r5, [%[sha512]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #8]\n\t" "str r7, 
[%[sha512], #12]\n\t" #else "strd r6, r7, [%[sha512], #8]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #128]\n\t" "str r5, [sp, #132]\n\t" #else "strd r4, r5, [sp, #128]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #136]\n\t" "str r7, [sp, #140]\n\t" #else "strd r6, r7, [sp, #136]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #16]\n\t" "ldr r5, [%[sha512], #20]\n\t" #else "ldrd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #24]\n\t" "ldr r7, [%[sha512], #28]\n\t" #else "ldrd r6, r7, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #144]\n\t" "ldr r9, [sp, #148]\n\t" #else "ldrd r8, r9, [sp, #144]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [sp, #152]\n\t" "ldr r11, [sp, #156]\n\t" #else @@ -7260,49 +7467,49 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r9\n\t" "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #16]\n\t" "str r5, [%[sha512], #20]\n\t" #else "strd r4, r5, [%[sha512], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #24]\n\t" "str r7, [%[sha512], #28]\n\t" #else "strd r6, r7, [%[sha512], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #144]\n\t" "str r5, [sp, #148]\n\t" #else "strd r4, r5, [sp, #144]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #152]\n\t" "str r7, [sp, #156]\n\t" #else "strd r6, r7, [sp, #152]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #32]\n\t" "ldr r5, [%[sha512], #36]\n\t" #else "ldrd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #40]\n\t" "ldr r7, [%[sha512], #44]\n\t" #else "ldrd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #160]\n\t" "ldr r9, [sp, #164]\n\t" #else "ldrd r8, r9, [sp, #160]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [sp, #168]\n\t" "ldr r11, [sp, #172]\n\t" #else @@ -7312,49 +7519,49 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r9\n\t" "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) 
"str r4, [%[sha512], #32]\n\t" "str r5, [%[sha512], #36]\n\t" #else "strd r4, r5, [%[sha512], #32]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #40]\n\t" "str r7, [%[sha512], #44]\n\t" #else "strd r6, r7, [%[sha512], #40]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #160]\n\t" "str r5, [sp, #164]\n\t" #else "strd r4, r5, [sp, #160]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #168]\n\t" "str r7, [sp, #172]\n\t" #else "strd r6, r7, [sp, #168]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[sha512], #48]\n\t" "ldr r5, [%[sha512], #52]\n\t" #else "ldrd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[sha512], #56]\n\t" "ldr r7, [%[sha512], #60]\n\t" #else "ldrd r6, r7, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [sp, #176]\n\t" "ldr r9, [sp, #180]\n\t" #else "ldrd r8, r9, [sp, #176]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r10, [sp, #184]\n\t" "ldr r11, [sp, #188]\n\t" #else @@ -7364,25 +7571,25 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "adc r5, r5, r9\n\t" "adds r6, r6, r10\n\t" "adc r7, r7, r11\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[sha512], #48]\n\t" "str r5, [%[sha512], #52]\n\t" #else "strd r4, r5, [%[sha512], #48]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[sha512], #56]\n\t" "str r7, [%[sha512], #60]\n\t" #else "strd r6, r7, [%[sha512], #56]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [sp, #176]\n\t" "str r5, [sp, #180]\n\t" #else "strd r4, r5, [sp, #176]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [sp, #184]\n\t" "str r7, [sp, #188]\n\t" #else @@ -8954,4 +9161,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif /* WOLFSSL_SHA512 */ #endif /* !__aarch64__ && !__thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* WOLFSSL_ARMASM */ + #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S index 6978d9d3e..3f04ce87a 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519.S +++ b/wolfcrypt/src/port/arm/armv8-curve25519.S @@ -593,9 +593,9 @@ _fe_mul: adds x7, x7, x3 umulh x4, x15, x19 adcs x8, x8, x4 - adc x9, x9, xzr # A[1] * B[3] umulh x11, x15, x22 + adc x9, x9, xzr mul x10, x15, x22 # A[0] * B[1] mul x3, x14, x20 @@ -1842,9 +1842,9 @@ L_curve25519_bits: adds x20, x20, x3 umulh x4, x15, x6 adcs x21, x21, x4 - adc x22, x22, xzr # A[1] * B[3] umulh x26, x15, x9 + adc x22, 
x22, xzr mul x25, x15, x9 # A[0] * B[1] mul x3, x14, x7 @@ -1958,9 +1958,9 @@ L_curve25519_bits: adds x20, x20, x3 umulh x4, x11, x25 adcs x21, x21, x4 - adc x22, x22, xzr # A[1] * B[3] umulh x15, x11, x28 + adc x22, x22, xzr mul x14, x11, x28 # A[0] * B[1] mul x3, x10, x26 @@ -2229,9 +2229,9 @@ L_curve25519_bits: adds x7, x7, x3 umulh x4, x15, x10 adcs x8, x8, x4 - adc x9, x9, xzr # A[1] * B[3] umulh x26, x15, x13 + adc x9, x9, xzr mul x25, x15, x13 # A[0] * B[1] mul x3, x14, x11 @@ -2400,9 +2400,9 @@ L_curve25519_bits: adds x7, x7, x3 umulh x4, x15, x10 adcs x8, x8, x4 - adc x9, x9, xzr # A[1] * B[3] umulh x26, x15, x13 + adc x9, x9, xzr mul x25, x15, x13 # A[0] * B[1] mul x3, x14, x11 @@ -2708,9 +2708,9 @@ L_curve25519_bits: adds x11, x11, x3 umulh x4, x20, x14 adcs x12, x12, x4 - adc x13, x13, xzr # A[1] * B[3] umulh x26, x20, x17 + adc x13, x13, xzr mul x25, x20, x17 # A[0] * B[1] mul x3, x19, x15 @@ -3679,9 +3679,9 @@ L_curve25519_inv_8: adds x15, x15, x3 umulh x4, x7, x10 adcs x16, x16, x4 - adc x17, x17, xzr # A[1] * B[3] umulh x20, x7, x13 + adc x17, x17, xzr mul x19, x7, x13 # A[0] * B[1] mul x3, x6, x11 @@ -4664,9 +4664,9 @@ _ge_p1p1_to_p2: adds x15, x15, x3 umulh x4, x11, x6 adcs x16, x16, x4 - adc x17, x17, xzr # A[1] * B[3] umulh x20, x11, x9 + adc x17, x17, xzr mul x19, x11, x9 # A[0] * B[1] mul x3, x10, x7 @@ -4782,9 +4782,9 @@ _ge_p1p1_to_p2: adds x15, x15, x3 umulh x4, x11, x6 adcs x16, x16, x4 - adc x17, x17, xzr # A[1] * B[3] umulh x20, x11, x9 + adc x17, x17, xzr mul x19, x11, x9 # A[0] * B[1] mul x3, x10, x7 @@ -4900,9 +4900,9 @@ _ge_p1p1_to_p2: adds x15, x15, x3 umulh x4, x11, x6 adcs x16, x16, x4 - adc x17, x17, xzr # A[1] * B[3] umulh x20, x11, x9 + adc x17, x17, xzr mul x19, x11, x9 # A[0] * B[1] mul x3, x10, x7 @@ -5051,9 +5051,9 @@ _ge_p1p1_to_p3: adds x15, x15, x3 umulh x4, x11, x6 adcs x16, x16, x4 - adc x17, x17, xzr # A[1] * B[3] umulh x20, x11, x9 + adc x17, x17, xzr mul x19, x11, x9 # A[0] * B[1] mul x3, x10, x7 @@ -5169,9 +5169,9 @@ _ge_p1p1_to_p3: adds x15, x15, x3 umulh x4, x24, x6 adcs x16, x16, x4 - adc x17, x17, xzr # A[1] * B[3] umulh x20, x24, x9 + adc x17, x17, xzr mul x19, x24, x9 # A[0] * B[1] mul x3, x23, x7 @@ -5287,9 +5287,9 @@ _ge_p1p1_to_p3: adds x15, x15, x3 umulh x4, x24, x6 adcs x16, x16, x4 - adc x17, x17, xzr # A[1] * B[3] umulh x20, x24, x9 + adc x17, x17, xzr mul x19, x24, x9 # A[0] * B[1] mul x3, x23, x7 @@ -5403,9 +5403,9 @@ _ge_p1p1_to_p3: adds x15, x15, x3 umulh x4, x11, x6 adcs x16, x16, x4 - adc x17, x17, xzr # A[1] * B[3] umulh x20, x11, x9 + adc x17, x17, xzr mul x19, x11, x9 # A[0] * B[1] mul x3, x10, x7 @@ -6075,9 +6075,9 @@ _ge_madd: adds x22, x22, x25 umulh x26, x17, x8 adcs x23, x23, x26 - adc x24, x24, xzr # A[1] * B[3] umulh x5, x17, x11 + adc x24, x24, xzr mul x4, x17, x11 # A[0] * B[1] mul x25, x16, x9 @@ -6191,9 +6191,9 @@ _ge_madd: adds x5, x5, x25 umulh x26, x13, x16 adcs x6, x6, x26 - adc x7, x7, xzr # A[1] * B[3] umulh x9, x13, x20 + adc x7, x7, xzr mul x8, x13, x20 # A[0] * B[1] mul x25, x12, x17 @@ -6348,9 +6348,9 @@ _ge_madd: adds x17, x17, x25 umulh x26, x22, x4 adcs x19, x19, x26 - adc x20, x20, xzr # A[1] * B[3] umulh x9, x22, x7 + adc x20, x20, xzr mul x8, x22, x7 # A[0] * B[1] mul x25, x21, x5 @@ -6593,9 +6593,9 @@ _ge_msub: adds x22, x22, x25 umulh x26, x17, x8 adcs x23, x23, x26 - adc x24, x24, xzr # A[1] * B[3] umulh x5, x17, x11 + adc x24, x24, xzr mul x4, x17, x11 # A[0] * B[1] mul x25, x16, x9 @@ -6709,9 +6709,9 @@ _ge_msub: adds x5, x5, x25 umulh x26, x13, x16 adcs x6, x6, x26 - adc x7, x7, xzr # 
A[1] * B[3] umulh x9, x13, x20 + adc x7, x7, xzr mul x8, x13, x20 # A[0] * B[1] mul x25, x12, x17 @@ -6866,9 +6866,9 @@ _ge_msub: adds x17, x17, x25 umulh x26, x22, x4 adcs x19, x19, x26 - adc x20, x20, xzr # A[1] * B[3] umulh x9, x22, x7 + adc x20, x20, xzr mul x8, x22, x7 # A[0] * B[1] mul x25, x21, x5 @@ -7109,9 +7109,9 @@ _ge_add: adds x22, x22, x25 umulh x26, x17, x8 adcs x23, x23, x26 - adc x24, x24, xzr # A[1] * B[3] umulh x5, x17, x11 + adc x24, x24, xzr mul x4, x17, x11 # A[0] * B[1] mul x25, x16, x9 @@ -7228,9 +7228,9 @@ _ge_add: adds x5, x5, x25 umulh x26, x13, x16 adcs x6, x6, x26 - adc x7, x7, xzr # A[1] * B[3] umulh x9, x13, x20 + adc x7, x7, xzr mul x8, x13, x20 # A[0] * B[1] mul x25, x12, x17 @@ -7388,9 +7388,9 @@ _ge_add: adds x17, x17, x25 umulh x26, x22, x4 adcs x19, x19, x26 - adc x20, x20, xzr # A[1] * B[3] umulh x9, x22, x7 + adc x20, x20, xzr mul x8, x22, x7 # A[0] * B[1] mul x25, x21, x5 @@ -7509,9 +7509,9 @@ _ge_add: adds x9, x9, x25 umulh x26, x5, x12 adcs x10, x10, x26 - adc x11, x11, xzr # A[1] * B[3] umulh x17, x5, x15 + adc x11, x11, xzr mul x16, x5, x15 # A[0] * B[1] mul x25, x4, x13 @@ -7753,9 +7753,9 @@ _ge_sub: adds x22, x22, x25 umulh x26, x17, x8 adcs x23, x23, x26 - adc x24, x24, xzr # A[1] * B[3] umulh x5, x17, x11 + adc x24, x24, xzr mul x4, x17, x11 # A[0] * B[1] mul x25, x16, x9 @@ -7880,9 +7880,9 @@ _ge_sub: adds x5, x5, x25 umulh x26, x13, x16 adcs x6, x6, x26 - adc x7, x7, xzr # A[1] * B[3] umulh x9, x13, x20 + adc x7, x7, xzr mul x8, x13, x20 # A[0] * B[1] mul x25, x12, x17 @@ -8040,9 +8040,9 @@ _ge_sub: adds x17, x17, x25 umulh x26, x22, x4 adcs x19, x19, x26 - adc x20, x20, xzr # A[1] * B[3] umulh x9, x22, x7 + adc x20, x20, xzr mul x8, x22, x7 # A[0] * B[1] mul x25, x21, x5 @@ -8169,9 +8169,9 @@ _ge_sub: adds x9, x9, x25 umulh x26, x5, x12 adcs x10, x10, x26 - adc x11, x11, xzr # A[1] * B[3] umulh x17, x5, x15 + adc x11, x11, xzr mul x16, x5, x15 # A[0] * B[1] mul x25, x4, x13 @@ -8570,9 +8570,9 @@ _sc_muladd: adds x5, x5, x21 umulh x22, x13, x16 adcs x6, x6, x22 - adc x7, x7, xzr # A[1] * B[3] umulh x9, x13, x20 + adc x7, x7, xzr mul x8, x13, x20 # A[0] * B[1] mul x21, x12, x17 diff --git a/wolfcrypt/src/port/arm/armv8-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-curve25519_c.c index cbfb69b7e..f58b4365c 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-curve25519_c.c @@ -490,9 +490,9 @@ void fe_mul(fe r, const fe a, const fe b) "adds x7, x7, x3\n\t" "umulh x4, x15, x19\n\t" "adcs x8, x8, x4\n\t" - "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ "umulh x11, x15, x22\n\t" + "adc x9, x9, xzr\n\t" "mul x10, x15, x22\n\t" /* A[0] * B[1] */ "mul x3, x14, x20\n\t" @@ -1711,9 +1711,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "adds x20, x20, x3\n\t" "umulh x4, x15, x6\n\t" "adcs x21, x21, x4\n\t" - "adc x22, x22, xzr\n\t" /* A[1] * B[3] */ "umulh x26, x15, x9\n\t" + "adc x22, x22, xzr\n\t" "mul x25, x15, x9\n\t" /* A[0] * B[1] */ "mul x3, x14, x7\n\t" @@ -1827,9 +1827,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "adds x20, x20, x3\n\t" "umulh x4, x11, x25\n\t" "adcs x21, x21, x4\n\t" - "adc x22, x22, xzr\n\t" /* A[1] * B[3] */ "umulh x15, x11, x28\n\t" + "adc x22, x22, xzr\n\t" "mul x14, x11, x28\n\t" /* A[0] * B[1] */ "mul x3, x10, x26\n\t" @@ -2098,9 +2098,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "adds x7, x7, x3\n\t" "umulh x4, x15, x10\n\t" "adcs x8, x8, x4\n\t" - "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ "umulh x26, x15, x13\n\t" + "adc x9, x9, xzr\n\t" "mul x25, 
x15, x13\n\t" /* A[0] * B[1] */ "mul x3, x14, x11\n\t" @@ -2269,9 +2269,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "adds x7, x7, x3\n\t" "umulh x4, x15, x10\n\t" "adcs x8, x8, x4\n\t" - "adc x9, x9, xzr\n\t" /* A[1] * B[3] */ "umulh x26, x15, x13\n\t" + "adc x9, x9, xzr\n\t" "mul x25, x15, x13\n\t" /* A[0] * B[1] */ "mul x3, x14, x11\n\t" @@ -2577,9 +2577,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "adds x11, x11, x3\n\t" "umulh x4, x20, x14\n\t" "adcs x12, x12, x4\n\t" - "adc x13, x13, xzr\n\t" /* A[1] * B[3] */ "umulh x26, x20, x17\n\t" + "adc x13, x13, xzr\n\t" "mul x25, x20, x17\n\t" /* A[0] * B[1] */ "mul x3, x19, x15\n\t" @@ -3556,9 +3556,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "adds x15, x15, x3\n\t" "umulh x4, x7, x10\n\t" "adcs x16, x16, x4\n\t" - "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ "umulh x20, x7, x13\n\t" + "adc x17, x17, xzr\n\t" "mul x19, x7, x13\n\t" /* A[0] * B[1] */ "mul x3, x6, x11\n\t" @@ -4520,9 +4520,9 @@ void ge_p1p1_to_p2(ge_p2* r, const ge_p1p1* p) "adds x15, x15, x3\n\t" "umulh x4, x11, x6\n\t" "adcs x16, x16, x4\n\t" - "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ "umulh x20, x11, x9\n\t" + "adc x17, x17, xzr\n\t" "mul x19, x11, x9\n\t" /* A[0] * B[1] */ "mul x3, x10, x7\n\t" @@ -4638,9 +4638,9 @@ void ge_p1p1_to_p2(ge_p2* r, const ge_p1p1* p) "adds x15, x15, x3\n\t" "umulh x4, x11, x6\n\t" "adcs x16, x16, x4\n\t" - "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ "umulh x20, x11, x9\n\t" + "adc x17, x17, xzr\n\t" "mul x19, x11, x9\n\t" /* A[0] * B[1] */ "mul x3, x10, x7\n\t" @@ -4756,9 +4756,9 @@ void ge_p1p1_to_p2(ge_p2* r, const ge_p1p1* p) "adds x15, x15, x3\n\t" "umulh x4, x11, x6\n\t" "adcs x16, x16, x4\n\t" - "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ "umulh x20, x11, x9\n\t" + "adc x17, x17, xzr\n\t" "mul x19, x11, x9\n\t" /* A[0] * B[1] */ "mul x3, x10, x7\n\t" @@ -4890,9 +4890,9 @@ void ge_p1p1_to_p3(ge_p3* r, const ge_p1p1* p) "adds x15, x15, x3\n\t" "umulh x4, x11, x6\n\t" "adcs x16, x16, x4\n\t" - "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ "umulh x20, x11, x9\n\t" + "adc x17, x17, xzr\n\t" "mul x19, x11, x9\n\t" /* A[0] * B[1] */ "mul x3, x10, x7\n\t" @@ -5008,9 +5008,9 @@ void ge_p1p1_to_p3(ge_p3* r, const ge_p1p1* p) "adds x15, x15, x3\n\t" "umulh x4, x24, x6\n\t" "adcs x16, x16, x4\n\t" - "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ "umulh x20, x24, x9\n\t" + "adc x17, x17, xzr\n\t" "mul x19, x24, x9\n\t" /* A[0] * B[1] */ "mul x3, x23, x7\n\t" @@ -5126,9 +5126,9 @@ void ge_p1p1_to_p3(ge_p3* r, const ge_p1p1* p) "adds x15, x15, x3\n\t" "umulh x4, x24, x6\n\t" "adcs x16, x16, x4\n\t" - "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ "umulh x20, x24, x9\n\t" + "adc x17, x17, xzr\n\t" "mul x19, x24, x9\n\t" /* A[0] * B[1] */ "mul x3, x23, x7\n\t" @@ -5242,9 +5242,9 @@ void ge_p1p1_to_p3(ge_p3* r, const ge_p1p1* p) "adds x15, x15, x3\n\t" "umulh x4, x11, x6\n\t" "adcs x16, x16, x4\n\t" - "adc x17, x17, xzr\n\t" /* A[1] * B[3] */ "umulh x20, x11, x9\n\t" + "adc x17, x17, xzr\n\t" "mul x19, x11, x9\n\t" /* A[0] * B[1] */ "mul x3, x10, x7\n\t" @@ -5873,9 +5873,9 @@ void ge_madd(ge_p1p1* r, const ge_p3* p, const ge_precomp* q) "adds x22, x22, x25\n\t" "umulh x26, x17, x8\n\t" "adcs x23, x23, x26\n\t" - "adc x24, x24, xzr\n\t" /* A[1] * B[3] */ "umulh x5, x17, x11\n\t" + "adc x24, x24, xzr\n\t" "mul x4, x17, x11\n\t" /* A[0] * B[1] */ "mul x25, x16, x9\n\t" @@ -5989,9 +5989,9 @@ void ge_madd(ge_p1p1* r, const ge_p3* p, const ge_precomp* q) "adds x5, x5, x25\n\t" "umulh x26, x13, x16\n\t" "adcs x6, x6, x26\n\t" - "adc x7, x7, 
xzr\n\t" /* A[1] * B[3] */ "umulh x9, x13, x20\n\t" + "adc x7, x7, xzr\n\t" "mul x8, x13, x20\n\t" /* A[0] * B[1] */ "mul x25, x12, x17\n\t" @@ -6146,9 +6146,9 @@ void ge_madd(ge_p1p1* r, const ge_p3* p, const ge_precomp* q) "adds x17, x17, x25\n\t" "umulh x26, x22, x4\n\t" "adcs x19, x19, x26\n\t" - "adc x20, x20, xzr\n\t" /* A[1] * B[3] */ "umulh x9, x22, x7\n\t" + "adc x20, x20, xzr\n\t" "mul x8, x22, x7\n\t" /* A[0] * B[1] */ "mul x25, x21, x5\n\t" @@ -6370,9 +6370,9 @@ void ge_msub(ge_p1p1* r, const ge_p3* p, const ge_precomp* q) "adds x22, x22, x25\n\t" "umulh x26, x17, x8\n\t" "adcs x23, x23, x26\n\t" - "adc x24, x24, xzr\n\t" /* A[1] * B[3] */ "umulh x5, x17, x11\n\t" + "adc x24, x24, xzr\n\t" "mul x4, x17, x11\n\t" /* A[0] * B[1] */ "mul x25, x16, x9\n\t" @@ -6486,9 +6486,9 @@ void ge_msub(ge_p1p1* r, const ge_p3* p, const ge_precomp* q) "adds x5, x5, x25\n\t" "umulh x26, x13, x16\n\t" "adcs x6, x6, x26\n\t" - "adc x7, x7, xzr\n\t" /* A[1] * B[3] */ "umulh x9, x13, x20\n\t" + "adc x7, x7, xzr\n\t" "mul x8, x13, x20\n\t" /* A[0] * B[1] */ "mul x25, x12, x17\n\t" @@ -6643,9 +6643,9 @@ void ge_msub(ge_p1p1* r, const ge_p3* p, const ge_precomp* q) "adds x17, x17, x25\n\t" "umulh x26, x22, x4\n\t" "adcs x19, x19, x26\n\t" - "adc x20, x20, xzr\n\t" /* A[1] * B[3] */ "umulh x9, x22, x7\n\t" + "adc x20, x20, xzr\n\t" "mul x8, x22, x7\n\t" /* A[0] * B[1] */ "mul x25, x21, x5\n\t" @@ -6865,9 +6865,9 @@ void ge_add(ge_p1p1* r, const ge_p3* p, const ge_cached* q) "adds x22, x22, x25\n\t" "umulh x26, x17, x8\n\t" "adcs x23, x23, x26\n\t" - "adc x24, x24, xzr\n\t" /* A[1] * B[3] */ "umulh x5, x17, x11\n\t" + "adc x24, x24, xzr\n\t" "mul x4, x17, x11\n\t" /* A[0] * B[1] */ "mul x25, x16, x9\n\t" @@ -6984,9 +6984,9 @@ void ge_add(ge_p1p1* r, const ge_p3* p, const ge_cached* q) "adds x5, x5, x25\n\t" "umulh x26, x13, x16\n\t" "adcs x6, x6, x26\n\t" - "adc x7, x7, xzr\n\t" /* A[1] * B[3] */ "umulh x9, x13, x20\n\t" + "adc x7, x7, xzr\n\t" "mul x8, x13, x20\n\t" /* A[0] * B[1] */ "mul x25, x12, x17\n\t" @@ -7144,9 +7144,9 @@ void ge_add(ge_p1p1* r, const ge_p3* p, const ge_cached* q) "adds x17, x17, x25\n\t" "umulh x26, x22, x4\n\t" "adcs x19, x19, x26\n\t" - "adc x20, x20, xzr\n\t" /* A[1] * B[3] */ "umulh x9, x22, x7\n\t" + "adc x20, x20, xzr\n\t" "mul x8, x22, x7\n\t" /* A[0] * B[1] */ "mul x25, x21, x5\n\t" @@ -7265,9 +7265,9 @@ void ge_add(ge_p1p1* r, const ge_p3* p, const ge_cached* q) "adds x9, x9, x25\n\t" "umulh x26, x5, x12\n\t" "adcs x10, x10, x26\n\t" - "adc x11, x11, xzr\n\t" /* A[1] * B[3] */ "umulh x17, x5, x15\n\t" + "adc x11, x11, xzr\n\t" "mul x16, x5, x15\n\t" /* A[0] * B[1] */ "mul x25, x4, x13\n\t" @@ -7488,9 +7488,9 @@ void ge_sub(ge_p1p1* r, const ge_p3* p, const ge_cached* q) "adds x22, x22, x25\n\t" "umulh x26, x17, x8\n\t" "adcs x23, x23, x26\n\t" - "adc x24, x24, xzr\n\t" /* A[1] * B[3] */ "umulh x5, x17, x11\n\t" + "adc x24, x24, xzr\n\t" "mul x4, x17, x11\n\t" /* A[0] * B[1] */ "mul x25, x16, x9\n\t" @@ -7615,9 +7615,9 @@ void ge_sub(ge_p1p1* r, const ge_p3* p, const ge_cached* q) "adds x5, x5, x25\n\t" "umulh x26, x13, x16\n\t" "adcs x6, x6, x26\n\t" - "adc x7, x7, xzr\n\t" /* A[1] * B[3] */ "umulh x9, x13, x20\n\t" + "adc x7, x7, xzr\n\t" "mul x8, x13, x20\n\t" /* A[0] * B[1] */ "mul x25, x12, x17\n\t" @@ -7775,9 +7775,9 @@ void ge_sub(ge_p1p1* r, const ge_p3* p, const ge_cached* q) "adds x17, x17, x25\n\t" "umulh x26, x22, x4\n\t" "adcs x19, x19, x26\n\t" - "adc x20, x20, xzr\n\t" /* A[1] * B[3] */ "umulh x9, x22, x7\n\t" + "adc x20, x20, xzr\n\t" "mul x8, x22, 
x7\n\t" /* A[0] * B[1] */ "mul x25, x21, x5\n\t" @@ -7904,9 +7904,9 @@ void ge_sub(ge_p1p1* r, const ge_p3* p, const ge_cached* q) "adds x9, x9, x25\n\t" "umulh x26, x5, x12\n\t" "adcs x10, x10, x26\n\t" - "adc x11, x11, xzr\n\t" /* A[1] * B[3] */ "umulh x17, x5, x15\n\t" + "adc x11, x11, xzr\n\t" "mul x16, x5, x15\n\t" /* A[0] * B[1] */ "mul x25, x4, x13\n\t" @@ -8265,9 +8265,9 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c) "adds x5, x5, x21\n\t" "umulh x22, x13, x16\n\t" "adcs x6, x6, x22\n\t" - "adc x7, x7, xzr\n\t" /* A[1] * B[3] */ "umulh x9, x13, x20\n\t" + "adc x7, x7, xzr\n\t" "mul x8, x13, x20\n\t" /* A[0] * B[1] */ "mul x21, x12, x17\n\t" diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm.S b/wolfcrypt/src/port/arm/thumb2-aes-asm.S index 2df54f6e7..f483f87de 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm.S @@ -595,7 +595,11 @@ L_AES_invert_key_loop: STM r0!, {r6, r7, r8, r9} SUBS r11, r11, #0x2 SUB r10, r10, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_invert_key_loop +#else + BNE.N L_AES_invert_key_loop +#endif SUB r0, r0, r1, LSL #3 ADD r0, r0, #0x10 SUB r11, r1, #0x1 @@ -666,7 +670,11 @@ L_AES_invert_key_mix_loop: EOR r8, r8, r9, ROR #24 STR r8, [r0], #4 SUBS r11, r11, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_invert_key_mix_loop +#else + BNE.N L_AES_invert_key_mix_loop +#endif POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} # Cycle Count = 165 .size AES_invert_key,.-AES_invert_key @@ -695,9 +703,17 @@ AES_set_encrypt_key: LDR r8, L_AES_Thumb2_te ADR lr, L_AES_Thumb2_rcon CMP r1, #0x80 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_set_encrypt_key_start_128 +#else + BEQ.N L_AES_set_encrypt_key_start_128 +#endif CMP r1, #0xc0 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_set_encrypt_key_start_192 +#else + BEQ.N L_AES_set_encrypt_key_start_192 +#endif LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] REV r4, r4 @@ -757,7 +773,11 @@ L_AES_set_encrypt_key_loop_256: STM r2, {r4, r5, r6, r7} SUB r2, r2, #0x10 SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_set_encrypt_key_loop_256 +#else + BNE.N L_AES_set_encrypt_key_loop_256 +#endif UBFX r4, r7, #0, #8 UBFX r5, r7, #8, #8 UBFX r6, r7, #16, #8 @@ -817,7 +837,11 @@ L_AES_set_encrypt_key_loop_192: EOR r7, r7, r6 STM r2, {r0, r1, r4, r5, r6, r7} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_set_encrypt_key_loop_192 +#else + BNE.N L_AES_set_encrypt_key_loop_192 +#endif UBFX r0, r7, #0, #8 UBFX r1, r7, #8, #8 UBFX r4, r7, #16, #8 @@ -868,7 +892,11 @@ L_AES_set_encrypt_key_loop_128: EOR r7, r7, r6 STM r2, {r4, r5, r6, r7} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_set_encrypt_key_loop_128 +#else + BNE.N L_AES_set_encrypt_key_loop_128 +#endif L_AES_set_encrypt_key_end: POP {r4, r5, r6, r7, r8, pc} # Cycle Count = 327 @@ -981,7 +1009,11 @@ L_AES_encrypt_block_nr: EOR r6, r6, r10 EOR r7, r7, r11 SUBS r1, r1, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_encrypt_block_nr +#else + BNE.N L_AES_encrypt_block_nr +#endif UBFX r8, r5, #16, #8 LSR r11, r4, #24 UBFX lr, r6, #8, #8 @@ -1105,9 +1137,17 @@ AES_ECB_encrypt: LDR r12, [sp, #36] PUSH {r3} CMP r12, #0xa +#if 
defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_ECB_encrypt_start_block_128 +#else + BEQ.N L_AES_ECB_encrypt_start_block_128 +#endif CMP r12, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_ECB_encrypt_start_block_192 +#else + BEQ.N L_AES_ECB_encrypt_start_block_192 +#endif L_AES_ECB_encrypt_loop_block_256: LDR r4, [lr] LDR r5, [lr, #4] @@ -1139,7 +1179,11 @@ L_AES_ECB_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_ECB_encrypt_loop_block_256 +#else + BNE.N L_AES_ECB_encrypt_loop_block_256 +#endif B L_AES_ECB_encrypt_end L_AES_ECB_encrypt_start_block_192: L_AES_ECB_encrypt_loop_block_192: @@ -1173,7 +1217,11 @@ L_AES_ECB_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_ECB_encrypt_loop_block_192 +#else + BNE.N L_AES_ECB_encrypt_loop_block_192 +#endif B L_AES_ECB_encrypt_end L_AES_ECB_encrypt_start_block_128: L_AES_ECB_encrypt_loop_block_128: @@ -1207,7 +1255,11 @@ L_AES_ECB_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_ECB_encrypt_loop_block_128 +#else + BNE.N L_AES_ECB_encrypt_loop_block_128 +#endif L_AES_ECB_encrypt_end: POP {r3} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -1228,9 +1280,17 @@ AES_CBC_encrypt: LDM r9, {r4, r5, r6, r7} PUSH {r3, r9} CMP r8, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_CBC_encrypt_start_block_128 +#else + BEQ.N L_AES_CBC_encrypt_start_block_128 +#endif CMP r8, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_CBC_encrypt_start_block_192 +#else + BEQ.N L_AES_CBC_encrypt_start_block_192 +#endif L_AES_CBC_encrypt_loop_block_256: LDR r8, [lr] LDR r9, [lr, #4] @@ -1266,7 +1326,11 @@ L_AES_CBC_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_CBC_encrypt_loop_block_256 +#else + BNE.N L_AES_CBC_encrypt_loop_block_256 +#endif B L_AES_CBC_encrypt_end L_AES_CBC_encrypt_start_block_192: L_AES_CBC_encrypt_loop_block_192: @@ -1304,7 +1368,11 @@ L_AES_CBC_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_CBC_encrypt_loop_block_192 +#else + BNE.N L_AES_CBC_encrypt_loop_block_192 +#endif B L_AES_CBC_encrypt_end L_AES_CBC_encrypt_start_block_128: L_AES_CBC_encrypt_loop_block_128: @@ -1342,7 +1410,11 @@ L_AES_CBC_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_CBC_encrypt_loop_block_128 +#else + BNE.N L_AES_CBC_encrypt_loop_block_128 +#endif L_AES_CBC_encrypt_end: POP {r3, r9} STM r9, {r4, r5, r6, r7} @@ -1369,9 +1441,17 @@ AES_CTR_encrypt: STM r8, {r4, r5, r6, r7} PUSH {r3, r8} CMP r12, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_CTR_encrypt_start_block_128 +#else + BEQ.N L_AES_CTR_encrypt_start_block_128 +#endif CMP r12, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_CTR_encrypt_start_block_192 +#else + BEQ.N 
L_AES_CTR_encrypt_start_block_192 +#endif L_AES_CTR_encrypt_loop_block_256: PUSH {r1, r2, lr} LDR lr, [sp, #16] @@ -1411,7 +1491,11 @@ L_AES_CTR_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_CTR_encrypt_loop_block_256 +#else + BNE.N L_AES_CTR_encrypt_loop_block_256 +#endif B L_AES_CTR_encrypt_end L_AES_CTR_encrypt_start_block_192: L_AES_CTR_encrypt_loop_block_192: @@ -1453,7 +1537,11 @@ L_AES_CTR_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_CTR_encrypt_loop_block_192 +#else + BNE.N L_AES_CTR_encrypt_loop_block_192 +#endif B L_AES_CTR_encrypt_end L_AES_CTR_encrypt_start_block_128: L_AES_CTR_encrypt_loop_block_128: @@ -1495,7 +1583,11 @@ L_AES_CTR_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_CTR_encrypt_loop_block_128 +#else + BNE.N L_AES_CTR_encrypt_loop_block_128 +#endif L_AES_CTR_encrypt_end: POP {r3, r8} REV r4, r4 @@ -1617,7 +1709,11 @@ L_AES_decrypt_block_nr: EOR r6, r6, r10 EOR r7, r7, r11 SUBS r1, r1, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_decrypt_block_nr +#else + BNE.N L_AES_decrypt_block_nr +#endif UBFX r8, r7, #16, #8 LSR r11, r4, #24 UBFX r12, r6, #8, #8 @@ -2001,9 +2097,17 @@ AES_ECB_decrypt: MOV r12, r2 ADR r2, L_AES_Thumb2_td4 CMP r8, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_ECB_decrypt_start_block_128 +#else + BEQ.N L_AES_ECB_decrypt_start_block_128 +#endif CMP r8, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_ECB_decrypt_start_block_192 +#else + BEQ.N L_AES_ECB_decrypt_start_block_192 +#endif L_AES_ECB_decrypt_loop_block_256: LDR r4, [lr] LDR r5, [lr, #4] @@ -2034,7 +2138,11 @@ L_AES_ECB_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_ECB_decrypt_loop_block_256 +#else + BNE.N L_AES_ECB_decrypt_loop_block_256 +#endif B L_AES_ECB_decrypt_end L_AES_ECB_decrypt_start_block_192: L_AES_ECB_decrypt_loop_block_192: @@ -2067,7 +2175,11 @@ L_AES_ECB_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_ECB_decrypt_loop_block_192 +#else + BNE.N L_AES_ECB_decrypt_loop_block_192 +#endif B L_AES_ECB_decrypt_end L_AES_ECB_decrypt_start_block_128: L_AES_ECB_decrypt_loop_block_128: @@ -2100,7 +2212,11 @@ L_AES_ECB_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_ECB_decrypt_loop_block_128 +#else + BNE.N L_AES_ECB_decrypt_loop_block_128 +#endif L_AES_ECB_decrypt_end: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} # Cycle Count = 210 @@ -2121,9 +2237,17 @@ AES_CBC_decrypt: ADR r2, L_AES_Thumb2_td4 PUSH {r3, r4} CMP r8, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_CBC_decrypt_loop_block_128 +#else + BEQ.N L_AES_CBC_decrypt_loop_block_128 +#endif CMP r8, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_CBC_decrypt_loop_block_192 +#else + BEQ.N 
L_AES_CBC_decrypt_loop_block_192 +#endif L_AES_CBC_decrypt_loop_block_256: PUSH {r1, r12, lr} LDR r4, [lr] @@ -2164,7 +2288,11 @@ L_AES_CBC_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_CBC_decrypt_end_odd +#else + BEQ.N L_AES_CBC_decrypt_end_odd +#endif PUSH {r1, r12, lr} LDR r4, [lr] LDR r5, [lr, #4] @@ -2205,7 +2333,11 @@ L_AES_CBC_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_CBC_decrypt_loop_block_256 +#else + BNE.N L_AES_CBC_decrypt_loop_block_256 +#endif B L_AES_CBC_decrypt_end L_AES_CBC_decrypt_loop_block_192: PUSH {r1, r12, lr} @@ -2247,7 +2379,11 @@ L_AES_CBC_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_CBC_decrypt_end_odd +#else + BEQ.N L_AES_CBC_decrypt_end_odd +#endif PUSH {r1, r12, lr} LDR r4, [lr] LDR r5, [lr, #4] @@ -2288,7 +2424,11 @@ L_AES_CBC_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_CBC_decrypt_loop_block_192 +#else + BNE.N L_AES_CBC_decrypt_loop_block_192 +#endif B L_AES_CBC_decrypt_end L_AES_CBC_decrypt_loop_block_128: PUSH {r1, r12, lr} @@ -2330,7 +2470,11 @@ L_AES_CBC_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_CBC_decrypt_end_odd +#else + BEQ.N L_AES_CBC_decrypt_end_odd +#endif PUSH {r1, r12, lr} LDR r4, [lr] LDR r5, [lr, #4] @@ -2371,7 +2515,11 @@ L_AES_CBC_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_CBC_decrypt_loop_block_128 +#else + BNE.N L_AES_CBC_decrypt_loop_block_128 +#endif B L_AES_CBC_decrypt_end L_AES_CBC_decrypt_end_odd: LDR r4, [sp, #4] @@ -2961,7 +3109,11 @@ L_GCM_gmult_len_start_block: POP {r3} SUBS r3, r3, #0x10 ADD r2, r2, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_GCM_gmult_len_start_block +#else + BNE.N L_GCM_gmult_len_start_block +#endif POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} # Cycle Count = 742 .size GCM_gmult_len,.-GCM_gmult_len @@ -2989,9 +3141,17 @@ AES_GCM_encrypt: STM r8, {r4, r5, r6, r7} PUSH {r3, r8} CMP r12, #0xa +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_GCM_encrypt_start_block_128 +#else + BEQ.N L_AES_GCM_encrypt_start_block_128 +#endif CMP r12, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BEQ L_AES_GCM_encrypt_start_block_192 +#else + BEQ.N L_AES_GCM_encrypt_start_block_192 +#endif L_AES_GCM_encrypt_loop_block_256: PUSH {r1, r2, lr} LDR lr, [sp, #16] @@ -3028,7 +3188,11 @@ L_AES_GCM_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_GCM_encrypt_loop_block_256 +#else + BNE.N L_AES_GCM_encrypt_loop_block_256 +#endif B L_AES_GCM_encrypt_end L_AES_GCM_encrypt_start_block_192: L_AES_GCM_encrypt_loop_block_192: @@ -3067,7 +3231,11 @@ L_AES_GCM_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || 
defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_GCM_encrypt_loop_block_192 +#else + BNE.N L_AES_GCM_encrypt_loop_block_192 +#endif B L_AES_GCM_encrypt_end L_AES_GCM_encrypt_start_block_128: L_AES_GCM_encrypt_loop_block_128: @@ -3106,7 +3274,11 @@ L_AES_GCM_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_GCM_encrypt_loop_block_128 +#else + BNE.N L_AES_GCM_encrypt_loop_block_128 +#endif L_AES_GCM_encrypt_end: POP {r3, r8} REV r4, r4 diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c index ec9f3f38c..1564a6f9d 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c @@ -37,6 +37,18 @@ #endif /* HAVE_CONFIG_H */ #include #ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ #ifndef NO_AES #include @@ -206,7 +218,11 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p) "STM %[ks]!, {r6, r7, r8, r9}\n\t" "SUBS r11, r11, #0x2\n\t" "SUB r10, r10, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_invert_key_loop_%=\n\t" +#else + "BNE.N L_AES_invert_key_loop_%=\n\t" +#endif "SUB %[ks], %[ks], %[rounds], LSL #3\n\t" "ADD %[ks], %[ks], #0x10\n\t" "SUB r11, %[rounds], #0x1\n\t" @@ -278,7 +294,11 @@ void AES_invert_key(unsigned char* ks_p, word32 rounds_p) "EOR r8, r8, r9, ROR #24\n\t" "STR r8, [%[ks]], #4\n\t" "SUBS r11, r11, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_invert_key_mix_loop_%=\n\t" +#else + "BNE.N L_AES_invert_key_mix_loop_%=\n\t" +#endif : [ks] "+r" (ks), [rounds] "+r" (rounds), [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_td] "+r" (L_AES_Thumb2_td_c) : : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" @@ -306,9 +326,17 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "MOV r8, %[L_AES_Thumb2_te]\n\t" "MOV lr, %[L_AES_Thumb2_rcon]\n\t" "CMP %[len], #0x80\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_set_encrypt_key_start_128_%=\n\t" +#else + "BEQ.N L_AES_set_encrypt_key_start_128_%=\n\t" +#endif "CMP %[len], #0xc0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_set_encrypt_key_start_192_%=\n\t" +#else + "BEQ.N L_AES_set_encrypt_key_start_192_%=\n\t" +#endif "LDRD r4, r5, [%[key]]\n\t" "LDRD r6, r7, [%[key], #8]\n\t" "REV r4, r4\n\t" @@ -369,7 +397,11 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "STM %[ks], {r4, r5, r6, r7}\n\t" "SUB %[ks], %[ks], #0x10\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_set_encrypt_key_loop_256_%=\n\t" +#else + "BNE.N L_AES_set_encrypt_key_loop_256_%=\n\t" +#endif "UBFX r4, r7, #0, #8\n\t" "UBFX r5, r7, #8, #8\n\t" "UBFX r6, r7, #16, #8\n\t" @@ -431,7 +463,11 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "EOR r7, r7, r6\n\t" "STM %[ks], {r0, r1, r4, r5, r6, r7}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_set_encrypt_key_loop_192_%=\n\t" +#else + "BNE.N L_AES_set_encrypt_key_loop_192_%=\n\t" +#endif "UBFX r0, r7, #0, #8\n\t" "UBFX r1, r7, #8, #8\n\t" "UBFX r4, r7, #16, #8\n\t" @@ -484,7 +520,11 @@ void AES_set_encrypt_key(const unsigned char* key_p, word32 len_p, unsigned char "EOR r7, r7, r6\n\t" "STM %[ks], {r4, r5, r6, r7}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_set_encrypt_key_loop_128_%=\n\t" +#else + "BNE.N L_AES_set_encrypt_key_loop_128_%=\n\t" +#endif "\n" "L_AES_set_encrypt_key_end_%=:\n\t" : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_rcon] "+r" (L_AES_Thumb2_rcon_c) @@ -605,7 +645,11 @@ void AES_encrypt_block(const uint32_t* te_p, int nr_p, int len_p, const uint32_t "EOR r6, r6, r10\n\t" "EOR r7, r7, r11\n\t" "SUBS %[nr], %[nr], #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_encrypt_block_nr_%=\n\t" +#else + "BNE.N L_AES_encrypt_block_nr_%=\n\t" +#endif "UBFX r8, r5, #16, #8\n\t" "LSR r11, r4, #24\n\t" "UBFX lr, r6, #8, #8\n\t" @@ -733,9 +777,17 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "MOV r12, r4\n\t" "PUSH {%[ks]}\n\t" "CMP r12, #0xa\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_ECB_encrypt_start_block_128_%=\n\t" +#else + "BEQ.N L_AES_ECB_encrypt_start_block_128_%=\n\t" +#endif "CMP r12, #0xc\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_ECB_encrypt_start_block_192_%=\n\t" +#else + "BEQ.N L_AES_ECB_encrypt_start_block_192_%=\n\t" +#endif "\n" "L_AES_ECB_encrypt_loop_block_256_%=:\n\t" "LDR r4, [lr]\n\t" @@ -768,7 +820,11 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_ECB_encrypt_loop_block_256_%=\n\t" +#else + "BNE.N L_AES_ECB_encrypt_loop_block_256_%=\n\t" +#endif "B L_AES_ECB_encrypt_end_%=\n\t" "\n" "L_AES_ECB_encrypt_start_block_192_%=:\n\t" @@ -804,7 +860,11 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_ECB_encrypt_loop_block_192_%=\n\t" +#else + "BNE.N L_AES_ECB_encrypt_loop_block_192_%=\n\t" +#endif "B L_AES_ECB_encrypt_end_%=\n\t" "\n" "L_AES_ECB_encrypt_start_block_128_%=:\n\t" @@ -840,7 +900,11 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_ECB_encrypt_loop_block_128_%=\n\t" +#else + "BNE.N L_AES_ECB_encrypt_loop_block_128_%=\n\t" +#endif "\n" "L_AES_ECB_encrypt_end_%=:\n\t" "POP {%[ks]}\n\t" @@ -848,7 +912,6 @@ void AES_ECB_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l : : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11" ); - (void)nr; } #endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ @@ -873,9 +936,17 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "LDM r9, 
{r4, r5, r6, r7}\n\t" "PUSH {%[ks], r9}\n\t" "CMP r8, #0xa\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_CBC_encrypt_start_block_128_%=\n\t" +#else + "BEQ.N L_AES_CBC_encrypt_start_block_128_%=\n\t" +#endif "CMP r8, #0xc\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_CBC_encrypt_start_block_192_%=\n\t" +#else + "BEQ.N L_AES_CBC_encrypt_start_block_192_%=\n\t" +#endif "\n" "L_AES_CBC_encrypt_loop_block_256_%=:\n\t" "LDR r8, [lr]\n\t" @@ -912,7 +983,11 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_CBC_encrypt_loop_block_256_%=\n\t" +#else + "BNE.N L_AES_CBC_encrypt_loop_block_256_%=\n\t" +#endif "B L_AES_CBC_encrypt_end_%=\n\t" "\n" "L_AES_CBC_encrypt_start_block_192_%=:\n\t" @@ -952,7 +1027,11 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_CBC_encrypt_loop_block_192_%=\n\t" +#else + "BNE.N L_AES_CBC_encrypt_loop_block_192_%=\n\t" +#endif "B L_AES_CBC_encrypt_end_%=\n\t" "\n" "L_AES_CBC_encrypt_start_block_128_%=:\n\t" @@ -992,7 +1071,11 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_CBC_encrypt_loop_block_128_%=\n\t" +#else + "BNE.N L_AES_CBC_encrypt_loop_block_128_%=\n\t" +#endif "\n" "L_AES_CBC_encrypt_end_%=:\n\t" "POP {%[ks], r9}\n\t" @@ -1001,8 +1084,6 @@ void AES_CBC_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l : : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); - (void)nr; - (void)iv; } #endif /* HAVE_AES_CBC */ @@ -1032,9 +1113,17 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "STM r8, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r8}\n\t" "CMP r12, #0xa\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_CTR_encrypt_start_block_128_%=\n\t" +#else + "BEQ.N L_AES_CTR_encrypt_start_block_128_%=\n\t" +#endif "CMP r12, #0xc\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_CTR_encrypt_start_block_192_%=\n\t" +#else + "BEQ.N L_AES_CTR_encrypt_start_block_192_%=\n\t" +#endif "\n" "L_AES_CTR_encrypt_loop_block_256_%=:\n\t" "PUSH {r1, %[len], lr}\n\t" @@ -1075,7 +1164,11 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_CTR_encrypt_loop_block_256_%=\n\t" +#else + "BNE.N L_AES_CTR_encrypt_loop_block_256_%=\n\t" +#endif "B L_AES_CTR_encrypt_end_%=\n\t" "\n" "L_AES_CTR_encrypt_start_block_192_%=:\n\t" @@ -1119,7 +1212,11 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_CTR_encrypt_loop_block_192_%=\n\t" 
+#else + "BNE.N L_AES_CTR_encrypt_loop_block_192_%=\n\t" +#endif "B L_AES_CTR_encrypt_end_%=\n\t" "\n" "L_AES_CTR_encrypt_start_block_128_%=:\n\t" @@ -1163,7 +1260,11 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_CTR_encrypt_loop_block_128_%=\n\t" +#else + "BNE.N L_AES_CTR_encrypt_loop_block_128_%=\n\t" +#endif "\n" "L_AES_CTR_encrypt_end_%=:\n\t" "POP {%[ks], r8}\n\t" @@ -1176,8 +1277,6 @@ void AES_CTR_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l : : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); - (void)nr; - (void)ctr; } #endif /* WOLFSSL_AES_COUNTER */ @@ -1294,7 +1393,11 @@ void AES_decrypt_block(const uint32_t* td_p, int nr_p, const uint8_t* td4_p) "EOR r6, r6, r10\n\t" "EOR r7, r7, r11\n\t" "SUBS %[nr], %[nr], #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_decrypt_block_nr_%=\n\t" +#else + "BNE.N L_AES_decrypt_block_nr_%=\n\t" +#endif "UBFX r8, r7, #16, #8\n\t" "LSR r11, r4, #24\n\t" "UBFX r12, r6, #8, #8\n\t" @@ -1457,9 +1560,17 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "MOV r12, %[len]\n\t" "MOV r2, %[L_AES_Thumb2_td4]\n\t" "CMP r8, #0xa\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_ECB_decrypt_start_block_128_%=\n\t" +#else + "BEQ.N L_AES_ECB_decrypt_start_block_128_%=\n\t" +#endif "CMP r8, #0xc\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_ECB_decrypt_start_block_192_%=\n\t" +#else + "BEQ.N L_AES_ECB_decrypt_start_block_192_%=\n\t" +#endif "\n" "L_AES_ECB_decrypt_loop_block_256_%=:\n\t" "LDR r4, [lr]\n\t" @@ -1491,7 +1602,11 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_ECB_decrypt_loop_block_256_%=\n\t" +#else + "BNE.N L_AES_ECB_decrypt_loop_block_256_%=\n\t" +#endif "B L_AES_ECB_decrypt_end_%=\n\t" "\n" "L_AES_ECB_decrypt_start_block_192_%=:\n\t" @@ -1526,7 +1641,11 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_ECB_decrypt_loop_block_192_%=\n\t" +#else + "BNE.N L_AES_ECB_decrypt_loop_block_192_%=\n\t" +#endif "B L_AES_ECB_decrypt_end_%=\n\t" "\n" "L_AES_ECB_decrypt_start_block_128_%=:\n\t" @@ -1561,14 +1680,17 @@ void AES_ECB_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_ECB_decrypt_loop_block_128_%=\n\t" +#else + "BNE.N L_AES_ECB_decrypt_loop_block_128_%=\n\t" +#endif "\n" "L_AES_ECB_decrypt_end_%=:\n\t" : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) : : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); - (void)nr; } #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ @@ -1595,9 +1717,17 @@ void 
AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "MOV r2, %[L_AES_Thumb2_td4]\n\t" "PUSH {%[ks], r4}\n\t" "CMP r8, #0xa\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#else + "BEQ.N L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#endif "CMP r8, #0xc\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#else + "BEQ.N L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#endif "\n" "L_AES_CBC_decrypt_loop_block_256_%=:\n\t" "PUSH {r1, r12, lr}\n\t" @@ -1639,7 +1769,11 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#else + "BEQ.N L_AES_CBC_decrypt_end_odd_%=\n\t" +#endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" @@ -1680,7 +1814,11 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_CBC_decrypt_loop_block_256_%=\n\t" +#else + "BNE.N L_AES_CBC_decrypt_loop_block_256_%=\n\t" +#endif "B L_AES_CBC_decrypt_end_%=\n\t" "\n" "L_AES_CBC_decrypt_loop_block_192_%=:\n\t" @@ -1723,7 +1861,11 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#else + "BEQ.N L_AES_CBC_decrypt_end_odd_%=\n\t" +#endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" @@ -1764,7 +1906,11 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#else + "BNE.N L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#endif "B L_AES_CBC_decrypt_end_%=\n\t" "\n" "L_AES_CBC_decrypt_loop_block_128_%=:\n\t" @@ -1807,7 +1953,11 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#else + "BEQ.N L_AES_CBC_decrypt_end_odd_%=\n\t" +#endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" @@ -1848,7 +1998,11 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#else + "BNE.N L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#endif "B L_AES_CBC_decrypt_end_%=\n\t" "\n" "L_AES_CBC_decrypt_end_odd_%=:\n\t" @@ -1864,8 +2018,6 @@ void AES_CBC_decrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l : : "memory", "r12", "lr", "r8", "r9", "r10", "r11" ); - (void)nr; - (void)iv; } #endif /* HAVE_AES_CBC */ @@ -2437,7 +2589,11 @@ void GCM_gmult_len(unsigned char* x_p, const 
unsigned char** m_p, const unsigned "POP {r3}\n\t" "SUBS %[len], %[len], #0x10\n\t" "ADD %[data], %[data], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_GCM_gmult_len_start_block_%=\n\t" +#else + "BNE.N L_GCM_gmult_len_start_block_%=\n\t" +#endif : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len), [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c) : : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "r11" @@ -2470,9 +2626,17 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "STM r8, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r8}\n\t" "CMP r12, #0xa\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_GCM_encrypt_start_block_128_%=\n\t" +#else + "BEQ.N L_AES_GCM_encrypt_start_block_128_%=\n\t" +#endif "CMP r12, #0xc\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BEQ L_AES_GCM_encrypt_start_block_192_%=\n\t" +#else + "BEQ.N L_AES_GCM_encrypt_start_block_192_%=\n\t" +#endif "\n" "L_AES_GCM_encrypt_loop_block_256_%=:\n\t" "PUSH {r1, %[len], lr}\n\t" @@ -2510,7 +2674,11 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_GCM_encrypt_loop_block_256_%=\n\t" +#else + "BNE.N L_AES_GCM_encrypt_loop_block_256_%=\n\t" +#endif "B L_AES_GCM_encrypt_end_%=\n\t" "\n" "L_AES_GCM_encrypt_start_block_192_%=:\n\t" @@ -2551,7 +2719,11 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_GCM_encrypt_loop_block_192_%=\n\t" +#else + "BNE.N L_AES_GCM_encrypt_loop_block_192_%=\n\t" +#endif "B L_AES_GCM_encrypt_end_%=\n\t" "\n" "L_AES_GCM_encrypt_start_block_128_%=:\n\t" @@ -2592,7 +2764,11 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_AES_GCM_encrypt_loop_block_128_%=\n\t" +#else + "BNE.N L_AES_GCM_encrypt_loop_block_128_%=\n\t" +#endif "\n" "L_AES_GCM_encrypt_end_%=:\n\t" "POP {%[ks], r8}\n\t" @@ -2605,12 +2781,13 @@ void AES_GCM_encrypt(const unsigned char* in_p, unsigned char* out_p, unsigned l : : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); - (void)nr; - (void)ctr; } #endif /* HAVE_AESGCM */ #endif /* !NO_AES */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* WOLFSSL_ARMASM */ + #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519.S b/wolfcrypt/src/port/arm/thumb2-curve25519.S index b836b4749..c5ca56b18 100644 --- a/wolfcrypt/src/port/arm/thumb2-curve25519.S +++ b/wolfcrypt/src/port/arm/thumb2-curve25519.S @@ -2741,12 +2741,20 @@ L_curve25519_bits: LDR r1, [sp, #180] SUBS r1, r1, #0x1 STR r1, [sp, #180] +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BGE L_curve25519_bits +#else + BGE.N L_curve25519_bits +#endif MOV r1, #0x1f STR r1, [sp, #180] SUBS r2, r2, #0x4 STR r2, [sp, #176] +#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) BGE L_curve25519_words +#else + BGE.N L_curve25519_words +#endif # Invert ADD r1, sp, #0x0 ADD r0, sp, #0x20 @@ -2783,7 +2791,11 @@ L_curve25519_inv_1: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_1 +#else + BNE.N L_curve25519_inv_1 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x40 @@ -2799,7 +2811,11 @@ L_curve25519_inv_2: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_2 +#else + BNE.N L_curve25519_inv_2 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x60 @@ -2815,7 +2831,11 @@ L_curve25519_inv_3: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_3 +#else + BNE.N L_curve25519_inv_3 +#endif ADD r2, sp, #0x60 ADD r1, sp, #0x80 ADD r0, sp, #0x60 @@ -2828,7 +2848,11 @@ L_curve25519_inv_4: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_4 +#else + BNE.N L_curve25519_inv_4 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x40 @@ -2844,7 +2868,11 @@ L_curve25519_inv_5: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_5 +#else + BNE.N L_curve25519_inv_5 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x60 @@ -2860,7 +2888,11 @@ L_curve25519_inv_6: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_6 +#else + BNE.N L_curve25519_inv_6 +#endif ADD r2, sp, #0x60 ADD r1, sp, #0x80 ADD r0, sp, #0x60 @@ -2873,7 +2905,11 @@ L_curve25519_inv_7: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_7 +#else + BNE.N L_curve25519_inv_7 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x40 @@ -2886,7 +2922,11 @@ L_curve25519_inv_8: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_8 +#else + BNE.N L_curve25519_inv_8 +#endif ADD r2, sp, #0x20 ADD r1, sp, #0x40 ADD r0, sp, #0x0 @@ -3022,7 +3062,11 @@ L_curve25519_bits: BL fe_mul_op LDR r2, [sp, #168] SUBS r2, r2, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BGE L_curve25519_bits +#else + BGE.N L_curve25519_bits +#endif # Cycle Count: 171 LDR r1, [sp, #184] # Copy @@ -3064,7 +3108,11 @@ L_curve25519_inv_1: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_1 +#else + BNE.N L_curve25519_inv_1 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x40 @@ -3080,7 +3128,11 @@ L_curve25519_inv_2: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_2 +#else + BNE.N L_curve25519_inv_2 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x60 @@ -3096,7 +3148,11 @@ L_curve25519_inv_3: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_3 +#else + BNE.N L_curve25519_inv_3 +#endif ADD r2, sp, #0x60 ADD r1, sp, #0x80 ADD r0, sp, #0x60 @@ -3109,7 +3165,11 @@ 
L_curve25519_inv_4: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_4 +#else + BNE.N L_curve25519_inv_4 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x40 @@ -3125,7 +3185,11 @@ L_curve25519_inv_5: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_5 +#else + BNE.N L_curve25519_inv_5 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x60 @@ -3141,7 +3205,11 @@ L_curve25519_inv_6: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_6 +#else + BNE.N L_curve25519_inv_6 +#endif ADD r2, sp, #0x60 ADD r1, sp, #0x80 ADD r0, sp, #0x60 @@ -3154,7 +3222,11 @@ L_curve25519_inv_7: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_7 +#else + BNE.N L_curve25519_inv_7 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x40 @@ -3167,7 +3239,11 @@ L_curve25519_inv_8: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_curve25519_inv_8 +#else + BNE.N L_curve25519_inv_8 +#endif ADD r2, sp, #0x20 ADD r1, sp, #0x40 ADD r0, sp, #0x0 @@ -3244,7 +3320,11 @@ L_fe_invert1: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_invert1 +#else + BNE.N L_fe_invert1 +#endif ADD r2, sp, #0x20 ADD r1, sp, #0x40 ADD r0, sp, #0x20 @@ -3260,7 +3340,11 @@ L_fe_invert2: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_invert2 +#else + BNE.N L_fe_invert2 +#endif ADD r2, sp, #0x20 ADD r1, sp, #0x40 ADD r0, sp, #0x40 @@ -3276,7 +3360,11 @@ L_fe_invert3: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_invert3 +#else + BNE.N L_fe_invert3 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x40 @@ -3289,7 +3377,11 @@ L_fe_invert4: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_invert4 +#else + BNE.N L_fe_invert4 +#endif ADD r2, sp, #0x20 ADD r1, sp, #0x40 ADD r0, sp, #0x20 @@ -3305,7 +3397,11 @@ L_fe_invert5: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_invert5 +#else + BNE.N L_fe_invert5 +#endif ADD r2, sp, #0x20 ADD r1, sp, #0x40 ADD r0, sp, #0x40 @@ -3321,7 +3417,11 @@ L_fe_invert6: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_invert6 +#else + BNE.N L_fe_invert6 +#endif ADD r2, sp, #0x40 ADD r1, sp, #0x60 ADD r0, sp, #0x40 @@ -3334,7 +3434,11 @@ L_fe_invert7: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_invert7 +#else + BNE.N L_fe_invert7 +#endif ADD r2, sp, #0x20 ADD r1, sp, #0x40 ADD r0, sp, #0x20 @@ -3347,7 +3451,11 @@ L_fe_invert8: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_invert8 +#else + BNE.N L_fe_invert8 +#endif MOV r2, sp ADD r1, sp, #0x20 LDR r0, [sp, #128] @@ -3863,7 +3971,11 @@ L_fe_pow22523_1: BL fe_sq_op POP {r12} 
SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_pow22523_1 +#else + BNE.N L_fe_pow22523_1 +#endif MOV r2, sp ADD r1, sp, #0x20 MOV r0, sp @@ -3879,7 +3991,11 @@ L_fe_pow22523_2: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_pow22523_2 +#else + BNE.N L_fe_pow22523_2 +#endif MOV r2, sp ADD r1, sp, #0x20 ADD r0, sp, #0x20 @@ -3895,7 +4011,11 @@ L_fe_pow22523_3: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_pow22523_3 +#else + BNE.N L_fe_pow22523_3 +#endif ADD r2, sp, #0x20 ADD r1, sp, #0x40 ADD r0, sp, #0x20 @@ -3908,7 +4028,11 @@ L_fe_pow22523_4: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_pow22523_4 +#else + BNE.N L_fe_pow22523_4 +#endif MOV r2, sp ADD r1, sp, #0x20 MOV r0, sp @@ -3924,7 +4048,11 @@ L_fe_pow22523_5: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_pow22523_5 +#else + BNE.N L_fe_pow22523_5 +#endif MOV r2, sp ADD r1, sp, #0x20 ADD r0, sp, #0x20 @@ -3940,7 +4068,11 @@ L_fe_pow22523_6: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_pow22523_6 +#else + BNE.N L_fe_pow22523_6 +#endif ADD r2, sp, #0x20 ADD r1, sp, #0x40 ADD r0, sp, #0x20 @@ -3953,7 +4085,11 @@ L_fe_pow22523_7: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_pow22523_7 +#else + BNE.N L_fe_pow22523_7 +#endif MOV r2, sp ADD r1, sp, #0x20 MOV r0, sp @@ -3966,7 +4102,11 @@ L_fe_pow22523_8: BL fe_sq_op POP {r12} SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_fe_pow22523_8 +#else + BNE.N L_fe_pow22523_8 +#endif LDR r2, [sp, #100] MOV r1, sp LDR r0, [sp, #96] diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c index 4df206607..2018b8e9d 100644 --- a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c +++ b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c @@ -37,6 +37,18 @@ #endif /* HAVE_CONFIG_H */ #include #ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ /* Based on work by: Emil Lenngren * https://github.com/pornin/X25519-Cortex-M4 */ @@ -2815,12 +2827,20 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "LDR %[n], [sp, #180]\n\t" "SUBS %[n], %[n], #0x1\n\t" "STR %[n], [sp, #180]\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGE L_curve25519_bits_%=\n\t" +#else + "BGE.N L_curve25519_bits_%=\n\t" +#endif "MOV %[n], #0x1f\n\t" "STR %[n], [sp, #180]\n\t" "SUBS %[a], %[a], #0x4\n\t" "STR %[a], [sp, #176]\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGE L_curve25519_words_%=\n\t" +#else + "BGE.N L_curve25519_words_%=\n\t" +#endif /* Invert */ "ADD r1, sp, #0x0\n\t" "ADD r0, sp, #0x20\n\t" @@ -2858,7 +2878,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if 
defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_1_%=\n\t" +#else + "BNE.N L_curve25519_inv_1_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x40\n\t" @@ -2875,7 +2899,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_2_%=\n\t" +#else + "BNE.N L_curve25519_inv_2_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" @@ -2892,7 +2920,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_3_%=\n\t" +#else + "BNE.N L_curve25519_inv_3_%=\n\t" +#endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x60\n\t" @@ -2906,7 +2938,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_4_%=\n\t" +#else + "BNE.N L_curve25519_inv_4_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x40\n\t" @@ -2923,7 +2959,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_5_%=\n\t" +#else + "BNE.N L_curve25519_inv_5_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" @@ -2940,7 +2980,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_6_%=\n\t" +#else + "BNE.N L_curve25519_inv_6_%=\n\t" +#endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x60\n\t" @@ -2954,7 +2998,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_7_%=\n\t" +#else + "BNE.N L_curve25519_inv_7_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x40\n\t" @@ -2968,7 +3016,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_8_%=\n\t" +#else + "BNE.N L_curve25519_inv_8_%=\n\t" +#endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x0\n\t" @@ -3110,7 +3162,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_mul_op\n\t" "LDR %[a], [sp, #168]\n\t" "SUBS %[a], %[a], #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGE L_curve25519_bits_%=\n\t" +#else + "BGE.N L_curve25519_bits_%=\n\t" +#endif /* Cycle Count: 171 */ "LDR %[n], [sp, #184]\n\t" /* Copy */ @@ -3153,7 +3209,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_1_%=\n\t" +#else + "BNE.N L_curve25519_inv_1_%=\n\t" +#endif "ADD r2, sp, 
#0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x40\n\t" @@ -3170,7 +3230,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_2_%=\n\t" +#else + "BNE.N L_curve25519_inv_2_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" @@ -3187,7 +3251,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_3_%=\n\t" +#else + "BNE.N L_curve25519_inv_3_%=\n\t" +#endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x60\n\t" @@ -3201,7 +3269,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_4_%=\n\t" +#else + "BNE.N L_curve25519_inv_4_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x40\n\t" @@ -3218,7 +3290,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_5_%=\n\t" +#else + "BNE.N L_curve25519_inv_5_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" @@ -3235,7 +3311,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_6_%=\n\t" +#else + "BNE.N L_curve25519_inv_6_%=\n\t" +#endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x60\n\t" @@ -3249,7 +3329,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_7_%=\n\t" +#else + "BNE.N L_curve25519_inv_7_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x40\n\t" @@ -3263,7 +3347,11 @@ int curve25519(byte* r_p, const byte* n_p, const byte* a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_curve25519_inv_8_%=\n\t" +#else + "BNE.N L_curve25519_inv_8_%=\n\t" +#endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x0\n\t" @@ -3345,7 +3433,11 @@ void fe_invert(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_invert1_%=\n\t" +#else + "BNE.N L_fe_invert1_%=\n\t" +#endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x20\n\t" @@ -3362,7 +3454,11 @@ void fe_invert(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_invert2_%=\n\t" +#else + "BNE.N L_fe_invert2_%=\n\t" +#endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" @@ -3379,7 +3475,11 @@ void fe_invert(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_invert3_%=\n\t" +#else + "BNE.N L_fe_invert3_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x40\n\t" @@ -3393,7 +3493,11 @@ void fe_invert(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_invert4_%=\n\t" +#else + "BNE.N L_fe_invert4_%=\n\t" +#endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x20\n\t" @@ -3410,7 +3514,11 @@ void fe_invert(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_invert5_%=\n\t" +#else + "BNE.N L_fe_invert5_%=\n\t" +#endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" @@ -3427,7 +3535,11 @@ void fe_invert(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_invert6_%=\n\t" +#else + "BNE.N L_fe_invert6_%=\n\t" +#endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x40\n\t" @@ -3441,7 +3553,11 @@ void fe_invert(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_invert7_%=\n\t" +#else + "BNE.N L_fe_invert7_%=\n\t" +#endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x20\n\t" @@ -3455,7 +3571,11 @@ void fe_invert(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_invert8_%=\n\t" +#else + "BNE.N L_fe_invert8_%=\n\t" +#endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" "LDR r0, [sp, #128]\n\t" @@ -3981,7 +4101,11 @@ void fe_pow22523(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_pow22523_1_%=\n\t" +#else + "BNE.N L_fe_pow22523_1_%=\n\t" +#endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" "MOV r0, sp\n\t" @@ -3998,7 +4122,11 @@ void fe_pow22523(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_pow22523_2_%=\n\t" +#else + "BNE.N L_fe_pow22523_2_%=\n\t" +#endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" @@ -4015,7 +4143,11 @@ void fe_pow22523(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_pow22523_3_%=\n\t" +#else + "BNE.N L_fe_pow22523_3_%=\n\t" +#endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x20\n\t" @@ -4029,7 +4161,11 @@ void fe_pow22523(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_pow22523_4_%=\n\t" +#else + "BNE.N L_fe_pow22523_4_%=\n\t" +#endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" "MOV r0, sp\n\t" @@ -4046,7 +4182,11 @@ void fe_pow22523(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_pow22523_5_%=\n\t" +#else + "BNE.N L_fe_pow22523_5_%=\n\t" +#endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" "ADD 
r0, sp, #0x20\n\t" @@ -4063,7 +4203,11 @@ void fe_pow22523(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_pow22523_6_%=\n\t" +#else + "BNE.N L_fe_pow22523_6_%=\n\t" +#endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x20\n\t" @@ -4077,7 +4221,11 @@ void fe_pow22523(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_pow22523_7_%=\n\t" +#else + "BNE.N L_fe_pow22523_7_%=\n\t" +#endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" "MOV r0, sp\n\t" @@ -4091,7 +4239,11 @@ void fe_pow22523(fe r_p, const fe a_p) "BL fe_sq_op\n\t" "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_fe_pow22523_8_%=\n\t" +#else + "BNE.N L_fe_pow22523_8_%=\n\t" +#endif "LDR r2, [sp, #100]\n\t" "MOV r1, sp\n\t" "LDR r0, [sp, #96]\n\t" @@ -5289,7 +5441,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "SUB sp, sp, #0x50\n\t" "ADD lr, sp, #0x44\n\t" "STM lr, {%[s], %[a], %[c]}\n\t" - "MOV %[r], #0x0\n\t" + "MOV %[s], #0x0\n\t" "LDR r12, [%[a]]\n\t" /* A[0] * B[0] */ "LDR lr, [%[b]]\n\t" @@ -5306,25 +5458,25 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "STR %[c], [sp]\n\t" /* A[0] * B[1] */ "LDR lr, [%[b], #4]\n\t" - "MOV r11, %[r]\n\t" + "MOV r11, %[s]\n\t" "UMLAL r4, r11, r12, lr\n\t" "ADDS r5, r5, r11\n\t" /* A[0] * B[3] */ "LDR lr, [%[b], #12]\n\t" "ADCS r6, r6, #0x0\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r6, r11, r12, lr\n\t" "ADDS r7, r7, r11\n\t" /* A[0] * B[5] */ "LDR lr, [%[b], #20]\n\t" "ADCS r8, r8, #0x0\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r8, r11, r12, lr\n\t" "ADDS r9, r9, r11\n\t" /* A[0] * B[7] */ "LDR lr, [%[b], #28]\n\t" "ADCS r10, r10, #0x0\n\t" - "ADC %[c], %[r], #0x0\n\t" + "ADC %[c], %[s], #0x0\n\t" "UMLAL r10, %[c], r12, lr\n\t" /* A[1] * B[0] */ "LDR r12, [%[a], #4]\n\t" @@ -5335,37 +5487,37 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "ADDS r5, r5, r11\n\t" /* A[1] * B[1] */ "LDR lr, [%[b], #4]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r5, r11, r12, lr\n\t" "ADDS r6, r6, r11\n\t" /* A[1] * B[2] */ "LDR lr, [%[b], #8]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r6, r11, r12, lr\n\t" "ADDS r7, r7, r11\n\t" /* A[1] * B[3] */ "LDR lr, [%[b], #12]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r7, r11, r12, lr\n\t" "ADDS r8, r8, r11\n\t" /* A[1] * B[4] */ "LDR lr, [%[b], #16]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r8, r11, r12, lr\n\t" "ADDS r9, r9, r11\n\t" /* A[1] * B[5] */ "LDR lr, [%[b], #20]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r9, r11, r12, lr\n\t" "ADDS r10, r10, r11\n\t" /* A[1] * B[6] */ "LDR lr, [%[b], #24]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r10, r11, r12, lr\n\t" "ADDS %[c], %[c], r11\n\t" /* A[1] * B[7] */ "LDR lr, [%[b], #28]\n\t" - "ADC r4, %[r], #0x0\n\t" + "ADC r4, %[s], #0x0\n\t" "UMLAL %[c], r4, r12, lr\n\t" /* A[2] * B[0] */ "LDR r12, [%[a], #8]\n\t" @@ -5376,37 +5528,37 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "ADDS r6, r6, r11\n\t" /* A[2] * B[1] */ "LDR lr, [%[b], #4]\n\t" - "ADC r11, %[r], 
#0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r6, r11, r12, lr\n\t" "ADDS r7, r7, r11\n\t" /* A[2] * B[2] */ "LDR lr, [%[b], #8]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r7, r11, r12, lr\n\t" "ADDS r8, r8, r11\n\t" /* A[2] * B[3] */ "LDR lr, [%[b], #12]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r8, r11, r12, lr\n\t" "ADDS r9, r9, r11\n\t" /* A[2] * B[4] */ "LDR lr, [%[b], #16]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r9, r11, r12, lr\n\t" "ADDS r10, r10, r11\n\t" /* A[2] * B[5] */ "LDR lr, [%[b], #20]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r10, r11, r12, lr\n\t" "ADDS %[c], %[c], r11\n\t" /* A[2] * B[6] */ "LDR lr, [%[b], #24]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL %[c], r11, r12, lr\n\t" "ADDS r4, r4, r11\n\t" /* A[2] * B[7] */ "LDR lr, [%[b], #28]\n\t" - "ADC r5, %[r], #0x0\n\t" + "ADC r5, %[s], #0x0\n\t" "UMLAL r4, r5, r12, lr\n\t" /* A[3] * B[0] */ "LDR r12, [%[a], #12]\n\t" @@ -5417,37 +5569,37 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "ADDS r7, r7, r11\n\t" /* A[3] * B[1] */ "LDR lr, [%[b], #4]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r7, r11, r12, lr\n\t" "ADDS r8, r8, r11\n\t" /* A[3] * B[2] */ "LDR lr, [%[b], #8]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r8, r11, r12, lr\n\t" "ADDS r9, r9, r11\n\t" /* A[3] * B[3] */ "LDR lr, [%[b], #12]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r9, r11, r12, lr\n\t" "ADDS r10, r10, r11\n\t" /* A[3] * B[4] */ "LDR lr, [%[b], #16]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r10, r11, r12, lr\n\t" "ADDS %[c], %[c], r11\n\t" /* A[3] * B[5] */ "LDR lr, [%[b], #20]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL %[c], r11, r12, lr\n\t" "ADDS r4, r4, r11\n\t" /* A[3] * B[6] */ "LDR lr, [%[b], #24]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r4, r11, r12, lr\n\t" "ADDS r5, r5, r11\n\t" /* A[3] * B[7] */ "LDR lr, [%[b], #28]\n\t" - "ADC r6, %[r], #0x0\n\t" + "ADC r6, %[s], #0x0\n\t" "UMLAL r5, r6, r12, lr\n\t" /* A[4] * B[0] */ "LDR r12, [%[a], #16]\n\t" @@ -5458,37 +5610,37 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "ADDS r8, r8, r11\n\t" /* A[4] * B[1] */ "LDR lr, [%[b], #4]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r8, r11, r12, lr\n\t" "ADDS r9, r9, r11\n\t" /* A[4] * B[2] */ "LDR lr, [%[b], #8]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r9, r11, r12, lr\n\t" "ADDS r10, r10, r11\n\t" /* A[4] * B[3] */ "LDR lr, [%[b], #12]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r10, r11, r12, lr\n\t" "ADDS %[c], %[c], r11\n\t" /* A[4] * B[4] */ "LDR lr, [%[b], #16]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL %[c], r11, r12, lr\n\t" "ADDS r4, r4, r11\n\t" /* A[4] * B[5] */ "LDR lr, [%[b], #20]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r4, r11, r12, lr\n\t" "ADDS r5, r5, r11\n\t" /* A[4] * B[6] */ "LDR lr, [%[b], #24]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r5, r11, r12, lr\n\t" "ADDS r6, r6, r11\n\t" /* A[4] * B[7] */ "LDR lr, [%[b], #28]\n\t" - "ADC r7, %[r], #0x0\n\t" + "ADC r7, %[s], #0x0\n\t" "UMLAL r6, r7, r12, lr\n\t" /* A[5] * B[0] */ "LDR r12, [%[a], #20]\n\t" @@ -5499,37 +5651,37 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* 
c_p) "ADDS r9, r9, r11\n\t" /* A[5] * B[1] */ "LDR lr, [%[b], #4]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r9, r11, r12, lr\n\t" "ADDS r10, r10, r11\n\t" /* A[5] * B[2] */ "LDR lr, [%[b], #8]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r10, r11, r12, lr\n\t" "ADDS %[c], %[c], r11\n\t" /* A[5] * B[3] */ "LDR lr, [%[b], #12]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL %[c], r11, r12, lr\n\t" "ADDS r4, r4, r11\n\t" /* A[5] * B[4] */ "LDR lr, [%[b], #16]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r4, r11, r12, lr\n\t" "ADDS r5, r5, r11\n\t" /* A[5] * B[5] */ "LDR lr, [%[b], #20]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r5, r11, r12, lr\n\t" "ADDS r6, r6, r11\n\t" /* A[5] * B[6] */ "LDR lr, [%[b], #24]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r6, r11, r12, lr\n\t" "ADDS r7, r7, r11\n\t" /* A[5] * B[7] */ "LDR lr, [%[b], #28]\n\t" - "ADC r8, %[r], #0x0\n\t" + "ADC r8, %[s], #0x0\n\t" "UMLAL r7, r8, r12, lr\n\t" /* A[6] * B[0] */ "LDR r12, [%[a], #24]\n\t" @@ -5540,37 +5692,37 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "ADDS r10, r10, r11\n\t" /* A[6] * B[1] */ "LDR lr, [%[b], #4]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r10, r11, r12, lr\n\t" "ADDS %[c], %[c], r11\n\t" /* A[6] * B[2] */ "LDR lr, [%[b], #8]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL %[c], r11, r12, lr\n\t" "ADDS r4, r4, r11\n\t" /* A[6] * B[3] */ "LDR lr, [%[b], #12]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r4, r11, r12, lr\n\t" "ADDS r5, r5, r11\n\t" /* A[6] * B[4] */ "LDR lr, [%[b], #16]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r5, r11, r12, lr\n\t" "ADDS r6, r6, r11\n\t" /* A[6] * B[5] */ "LDR lr, [%[b], #20]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r6, r11, r12, lr\n\t" "ADDS r7, r7, r11\n\t" /* A[6] * B[6] */ "LDR lr, [%[b], #24]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r7, r11, r12, lr\n\t" "ADDS r8, r8, r11\n\t" /* A[6] * B[7] */ "LDR lr, [%[b], #28]\n\t" - "ADC r9, %[r], #0x0\n\t" + "ADC r9, %[s], #0x0\n\t" "UMLAL r8, r9, r12, lr\n\t" /* A[7] * B[0] */ "LDR r12, [%[a], #28]\n\t" @@ -5581,37 +5733,37 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) "ADDS %[c], %[c], r11\n\t" /* A[7] * B[1] */ "LDR lr, [%[b], #4]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL %[c], r11, r12, lr\n\t" "ADDS r4, r4, r11\n\t" /* A[7] * B[2] */ "LDR lr, [%[b], #8]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r4, r11, r12, lr\n\t" "ADDS r5, r5, r11\n\t" /* A[7] * B[3] */ "LDR lr, [%[b], #12]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r5, r11, r12, lr\n\t" "ADDS r6, r6, r11\n\t" /* A[7] * B[4] */ "LDR lr, [%[b], #16]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r6, r11, r12, lr\n\t" "ADDS r7, r7, r11\n\t" /* A[7] * B[5] */ "LDR lr, [%[b], #20]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r7, r11, r12, lr\n\t" "ADDS r8, r8, r11\n\t" /* A[7] * B[6] */ "LDR lr, [%[b], #24]\n\t" - "ADC r11, %[r], #0x0\n\t" + "ADC r11, %[s], #0x0\n\t" "UMLAL r8, r11, r12, lr\n\t" "ADDS r9, r9, r11\n\t" /* A[7] * B[7] */ "LDR lr, [%[b], #28]\n\t" - "ADC r10, %[r], #0x0\n\t" + "ADC r10, %[s], #0x0\n\t" "UMLAL r9, r10, r12, lr\n\t" "ADD lr, sp, #0x20\n\t" "STM lr, {%[c], r4, r5, r6, 
r7, r8, r9, r10}\n\t" @@ -6505,4 +6657,7 @@ void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) #endif /* HAVE_CURVE25519 || HAVE_ED25519 */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* WOLFSSL_ARMASM */ + #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S index 91dc10b37..7c59e2548 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S @@ -925,7 +925,11 @@ L_SHA256_transform_len_start: STR r9, [sp, #60] ADD r3, r3, #0x40 SUBS r12, r12, #0x1 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_SHA256_transform_len_start +#else + BNE.N L_SHA256_transform_len_start +#endif # Round 0 LDR r5, [r0, #16] LDR r6, [r0, #20] @@ -1466,7 +1470,11 @@ L_SHA256_transform_len_start: SUBS r2, r2, #0x40 SUB r3, r3, #0xc0 ADD r1, r1, #0x40 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_SHA256_transform_len_begin +#else + BNE.N L_SHA256_transform_len_begin +#endif ADD sp, sp, #0xc0 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} # Cycle Count = 1874 diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c index a21a607fe..43659fb07 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c @@ -37,6 +37,18 @@ #endif /* HAVE_CONFIG_H */ #include #ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ #ifndef NO_SHA256 #include @@ -885,7 +897,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "STR r9, [sp, #60]\n\t" "ADD r3, r3, #0x40\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_SHA256_transform_len_start_%=\n\t" +#else + "BNE.N L_SHA256_transform_len_start_%=\n\t" +#endif /* Round 0 */ "LDR r5, [%[sha256], #16]\n\t" "LDR r6, [%[sha256], #20]\n\t" @@ -1426,7 +1442,11 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) "SUBS %[len], %[len], #0x40\n\t" "SUB r3, r3, #0xc0\n\t" "ADD %[data], %[data], #0x40\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_SHA256_transform_len_begin_%=\n\t" +#else + "BNE.N L_SHA256_transform_len_begin_%=\n\t" +#endif "ADD sp, sp, #0xc0\n\t" : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c) : @@ -1438,4 +1458,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256_p, const byte* data_p, word32 len_p) #endif /* !NO_SHA256 */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* WOLFSSL_ARMASM */ + #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S index 34912f431..b420e7863 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S @@ -2319,7 +2319,11 @@ L_SHA512_transform_len_start: STRD r4, r5, [sp, #120] ADD r3, r3, #0x80 SUBS r12, r12, #0x1 +#if defined(__GNUC__) || 
defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_SHA512_transform_len_start +#else + BNE.N L_SHA512_transform_len_start +#endif # Round 0 LDRD r4, r5, [r0, #32] LSRS r6, r4, #14 @@ -3652,7 +3656,11 @@ L_SHA512_transform_len_start: SUBS r2, r2, #0x80 SUB r3, r3, #0x200 ADD r1, r1, #0x80 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_SHA512_transform_len_begin +#else + BNE.N L_SHA512_transform_len_begin +#endif EOR r0, r0, r0 ADD sp, sp, #0xc0 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c index 9a0cd79c6..d62a035cc 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c @@ -37,6 +37,18 @@ #endif /* HAVE_CONFIG_H */ #include #ifdef WOLFSSL_ARMASM_INLINE + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__arm__) + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ #ifdef WOLFSSL_SHA512 #include @@ -2207,7 +2219,11 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "STRD r4, r5, [sp, #120]\n\t" "ADD r3, r3, #0x80\n\t" "SUBS r12, r12, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_SHA512_transform_len_start_%=\n\t" +#else + "BNE.N L_SHA512_transform_len_start_%=\n\t" +#endif /* Round 0 */ "LDRD r4, r5, [%[sha512], #32]\n\t" "LSRS r6, r4, #14\n\t" @@ -3540,7 +3556,11 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) "SUBS %[len], %[len], #0x80\n\t" "SUB r3, r3, #0x200\n\t" "ADD %[data], %[data], #0x80\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BNE L_SHA512_transform_len_begin_%=\n\t" +#else + "BNE.N L_SHA512_transform_len_begin_%=\n\t" +#endif "EOR r0, r0, r0\n\t" "ADD sp, sp, #0xc0\n\t" : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c) @@ -3553,4 +3573,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512_p, const byte* data_p, word32 len_p) #endif /* WOLFSSL_SHA512 */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ +#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* WOLFSSL_ARMASM */ + #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index 8a5bb7e6f..82ce50347 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -52,6 +52,15 @@ #include +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif + #ifdef WOLFSSL_SP_ARM32_ASM #define SP_PRINT_NUM(var, name, total, words, bits) \ do { \ @@ -118,14 +127,14 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -230,6 +239,7 @@ static void 
sp_2048_to_bin_64(sp_digit* r, byte* a) #define sp_2048_norm_64(a) #ifndef WOLFSSL_SP_SMALL +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Multiply a and b into r. (r = a * b) * * r A single precision integer. @@ -238,17 +248,15 @@ static void sp_2048_to_bin_64(sp_digit* r, byte* a) */ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #32\n\t" - "mov r10, #0\n\t" /* A[0] * B[0] */ "ldr r11, [%[a]]\n\t" "ldr r12, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r3, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -271,14 +279,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adc r4, r4, r7\n\t" "mov r5, #0\n\t" -#else - "umull r3, r4, r11, r12\n\t" - "mov r5, #0\n\t" -#endif "str r3, [sp]\n\t" /* A[0] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -308,16 +311,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[1] * B[0] */ "ldr r8, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -346,16 +341,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [sp, #4]\n\t" /* A[2] * B[0] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -385,17 +373,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[1] * B[1] */ "ldr r11, [%[a], #4]\n\t" "ldr r12, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -424,16 +404,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[0] * B[2] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -462,16 +435,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, 
r7\n\t" - "adc r4, r4, #0\n\t" -#endif "str r5, [sp, #8]\n\t" /* A[0] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -501,16 +467,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[1] * B[2] */ "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -539,15 +497,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[2] * B[1] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -576,16 +527,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[3] * B[0] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -614,16 +558,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif "str r3, [sp, #12]\n\t" /* A[4] * B[0] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -653,16 +590,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[3] * B[1] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -691,16 +620,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[2] * B[2] */ "ldr r11, [%[a], #8]\n\t" "ldr r12, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -729,16 +651,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[1] * B[3] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, 
#16\n\t" @@ -767,16 +682,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[0] * B[4] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -805,16 +713,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [sp, #16]\n\t" /* A[0] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -844,17 +745,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[1] * B[4] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -883,15 +776,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[2] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -920,15 +806,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[3] * B[2] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -957,16 +836,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[4] * B[1] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -995,16 +867,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[5] * B[0] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1033,16 +898,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, 
r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif "str r5, [sp, #20]\n\t" /* A[6] * B[0] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1072,17 +930,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[5] * B[1] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1111,15 +961,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[4] * B[2] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1148,16 +991,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[3] * B[3] */ "ldr r11, [%[a], #12]\n\t" "ldr r12, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1186,16 +1022,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[2] * B[4] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1224,16 +1053,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[1] * B[5] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1262,16 +1084,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[0] * B[6] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1300,16 +1115,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif "str r3, [sp, #24]\n\t" /* A[0] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1339,17 +1147,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[1] * B[6] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1378,16 +1178,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[2] * B[5] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1416,15 +1209,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[3] * B[4] */ "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1453,15 +1239,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[4] * B[3] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1490,16 +1269,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[5] * B[2] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1528,16 +1300,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[6] * B[1] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1566,16 +1331,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[7] * B[0] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1604,16 +1362,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, 
r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [sp, #28]\n\t" /* A[7] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1643,17 +1394,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[6] * B[2] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1682,15 +1425,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[5] * B[3] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1719,16 +1455,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[4] * B[4] */ "ldr r11, [%[a], #16]\n\t" "ldr r12, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1757,16 +1486,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[3] * B[5] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1795,16 +1517,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[2] * B[6] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1833,16 +1548,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[1] * B[7] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1871,16 +1579,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif "str r5, 
[%[r], #32]\n\t" /* A[2] * B[7] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1910,17 +1611,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[3] * B[6] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1949,15 +1642,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[4] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -1986,15 +1672,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[5] * B[4] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2023,16 +1702,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[6] * B[3] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2061,16 +1733,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[7] * B[2] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2099,16 +1764,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif "str r3, [%[r], #36]\n\t" /* A[7] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2138,16 +1796,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[6] * B[4] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2176,16 +1826,9 
@@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[5] * B[5] */ "ldr r11, [%[a], #20]\n\t" "ldr r12, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2214,16 +1857,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[4] * B[6] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2252,16 +1888,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[3] * B[7] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2290,16 +1919,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [%[r], #40]\n\t" /* A[4] * B[7] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2329,16 +1951,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[5] * B[6] */ "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2367,15 +1981,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[6] * B[5] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2404,16 +2011,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[7] * B[4] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2442,15 +2042,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds 
r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif "str r5, [%[r], #44]\n\t" /* A[7] * B[5] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2480,17 +2073,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[6] * B[6] */ "ldr r11, [%[a], #24]\n\t" "ldr r12, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2519,16 +2104,9 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[5] * B[7] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2557,15 +2135,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif "str r3, [%[r], #48]\n\t" /* A[6] * B[7] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2595,16 +2166,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[7] * B[6] */ "ldr r8, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2633,15 +2196,8 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [%[r], #52]\n\t" /* A[7] * B[7] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -2666,11 +2222,6 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "lsl r6, r6, #16\n\t" "adds r5, r5, r6\n\t" "adc r3, r3, r7\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r7\n\t" -#endif "str r5, [%[r], #56]\n\t" "str r3, [%[r], #60]\n\t" "ldm sp!, {r3, r4, r5, r6}\n\t" @@ -2679,10 +2230,495 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "stm %[r]!, {r3, r4, r5, r6}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12" ); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "sub sp, sp, #36\n\t" + "str %[r], [sp, #32]\n\t" + "mov %[r], #0\n\t" + "ldr r12, [%[a]]\n\t" + /* A[0] * B[0] */ + "ldr lr, [%[b]]\n\t" + "umull r3, r4, r12, lr\n\t" + /* A[0] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "umull r5, r6, r12, lr\n\t" + /* A[0] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "umull r7, r8, r12, lr\n\t" + /* A[0] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "umull r9, r10, r12, lr\n\t" + "str r3, [sp]\n\t" + /* A[0] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "mov r11, %[r]\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[0] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adcs r6, r6, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[0] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adcs r8, r8, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[0] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adcs r10, r10, #0\n\t" + "adc r3, %[r], #0\n\t" + "umlal r10, r3, r12, lr\n\t" + /* A[1] * B[0] */ + "ldr r12, [%[a], #4]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "str r4, [sp, #4]\n\t" + "adds r5, r5, r11\n\t" + /* A[1] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[1] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[1] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[1] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[1] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[1] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[1] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r4, %[r], #0\n\t" + "umlal r3, r4, r12, lr\n\t" + /* A[2] * B[0] */ + "ldr r12, [%[a], #8]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "str r5, [sp, #8]\n\t" + "adds r6, r6, r11\n\t" + /* A[2] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[2] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[2] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[2] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[2] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[2] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[2] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r5, %[r], #0\n\t" + "umlal r4, r5, r12, lr\n\t" + /* A[3] * B[0] */ + "ldr r12, [%[a], #12]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "str r6, [sp, #12]\n\t" + 
"adds r7, r7, r11\n\t" + /* A[3] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[3] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[3] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[3] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[3] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[3] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[3] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r6, %[r], #0\n\t" + "umlal r5, r6, r12, lr\n\t" + /* A[4] * B[0] */ + "ldr r12, [%[a], #16]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "str r7, [sp, #16]\n\t" + "adds r8, r8, r11\n\t" + /* A[4] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[4] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[4] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[4] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[4] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[4] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[4] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r7, %[r], #0\n\t" + "umlal r6, r7, r12, lr\n\t" + /* A[5] * B[0] */ + "ldr r12, [%[a], #20]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "str r8, [sp, #20]\n\t" + "adds r9, r9, r11\n\t" + /* A[5] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[5] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[5] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[5] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[5] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[5] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[5] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r8, %[r], #0\n\t" + "umlal r7, r8, r12, lr\n\t" + /* A[6] * B[0] */ + "ldr r12, [%[a], #24]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "str r9, [sp, #24]\n\t" + "adds r10, r10, r11\n\t" + /* A[6] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[6] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[6] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, 
lr\n\t" + "adds r5, r5, r11\n\t" + /* A[6] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[6] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[6] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[6] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r9, %[r], #0\n\t" + "umlal r8, r9, r12, lr\n\t" + /* A[7] * B[0] */ + "ldr r12, [%[a], #28]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "str r10, [sp, #28]\n\t" + "adds r3, r3, r11\n\t" + /* A[7] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[7] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[7] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[7] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[7] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[7] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[7] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r10, %[r], #0\n\t" + "umlal r9, r10, r12, lr\n\t" + "ldr %[r], [sp, #32]\n\t" + "add %[r], %[r], #32\n\t" + "stm %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "sub %[r], %[r], #32\n\t" + "stm %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add sp, sp, #36\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#else +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "sub sp, sp, #44\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str %[r], [sp, #36]\n\t" + "str %[a], [sp, #40]\n\t" +#else + "strd %[r], %[a], [sp, #36]\n\t" +#endif + "mov lr, %[b]\n\t" + "ldm %[a], {%[r], %[a], %[b], r3}\n\t" + "ldm lr!, {r4, r5, r6}\n\t" + "umull r10, r11, %[r], r4\n\t" + "umull r12, r7, %[a], r4\n\t" + "umaal r11, r12, %[r], r5\n\t" + "umull r8, r9, %[b], r4\n\t" + "umaal r12, r8, %[a], r5\n\t" + "umaal r12, r7, %[r], r6\n\t" + "umaal r8, r9, r3, r4\n\t" + "stm sp, {r10, r11, r12}\n\t" + "umaal r7, r8, %[b], r5\n\t" + "ldm lr!, {r4}\n\t" + "umull r10, r11, %[a], r6\n\t" + "umaal r8, r9, %[b], r6\n\t" + "umaal r7, r10, %[r], r4\n\t" + "umaal r8, r11, r3, r5\n\t" + "str r7, [sp, #12]\n\t" + "umaal r8, r10, %[a], r4\n\t" + "umaal r9, r11, r3, r6\n\t" + "umaal r9, r10, %[b], r4\n\t" + "umaal r10, r11, r3, r4\n\t" + "ldm lr, {r4, r5, r6, r7}\n\t" + "mov r12, #0\n\t" + "umlal r8, r12, %[r], r4\n\t" + "umaal r9, r12, %[a], r4\n\t" + "umaal r10, r12, %[b], r4\n\t" + "umaal r11, r12, r3, r4\n\t" + "mov r4, #0\n\t" + "umlal r9, r4, %[r], r5\n\t" + "umaal r10, r4, %[a], r5\n\t" + "umaal r11, r4, %[b], r5\n\t" + "umaal r12, r4, r3, r5\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, %[r], r6\n\t" + "umaal r11, r5, %[a], r6\n\t" + "umaal r12, r5, %[b], r6\n\t" + "umaal r4, r5, r3, r6\n\t" + "mov r6, #0\n\t" + "umlal r11, r6, %[r], r7\n\t" + "ldr %[r], [sp, #40]\n\t" + "umaal r12, r6, %[a], r7\n\t" + "add %[r], %[r], #16\n\t" + "umaal r4, r6, %[b], r7\n\t" + "sub lr, lr, #16\n\t" + "umaal r5, r6, r3, r7\n\t" + "ldm %[r], {%[r], %[a], %[b], r3}\n\t" + "str r6, [sp, #32]\n\t" + "ldm lr!, {r6}\n\t" + "mov r7, #0\n\t" + "umlal r8, r7, %[r], r6\n\t" + "umaal r9, r7, %[a], r6\n\t" + "str r8, [sp, #16]\n\t" + "umaal r10, r7, %[b], r6\n\t" + "umaal r11, r7, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r8, #0\n\t" + "umlal r9, r8, %[r], r6\n\t" + "umaal r10, r8, %[a], r6\n\t" + "str r9, [sp, #20]\n\t" + "umaal r11, r8, %[b], r6\n\t" + "umaal r12, r8, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r9, #0\n\t" + "umlal r10, r9, %[r], r6\n\t" + "umaal r11, r9, %[a], r6\n\t" + "str r10, [sp, #24]\n\t" + "umaal r12, r9, %[b], r6\n\t" + "umaal r4, r9, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r10, #0\n\t" + "umlal r11, r10, %[r], r6\n\t" + "umaal r12, r10, %[a], r6\n\t" + "str r11, [sp, #28]\n\t" + "umaal r4, r10, %[b], r6\n\t" + "umaal r5, r10, r3, r6\n\t" + "ldm lr!, {r11}\n\t" + "umaal r12, r7, %[r], r11\n\t" + "umaal r4, r7, %[a], r11\n\t" + "ldr r6, [sp, #32]\n\t" + "umaal r5, r7, %[b], r11\n\t" + "umaal r6, r7, r3, r11\n\t" + "ldm lr!, {r11}\n\t" + "umaal r4, r8, %[r], r11\n\t" + "umaal r5, r8, %[a], r11\n\t" + "umaal r6, r8, %[b], r11\n\t" + "umaal r7, r8, r3, r11\n\t" + "ldm lr, {r11, lr}\n\t" + "umaal r5, r9, %[r], r11\n\t" + "umaal r6, r10, %[r], lr\n\t" + "umaal r6, r9, %[a], r11\n\t" + "umaal r7, r10, %[a], lr\n\t" + "umaal r7, r9, %[b], r11\n\t" + "umaal r8, r10, %[b], lr\n\t" + "umaal r8, r9, r3, r11\n\t" + "umaal r9, r10, r3, lr\n\t" + "mov r3, r12\n\t" + "ldr lr, [sp, #36]\n\t" + "add lr, lr, #32\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "sub lr, lr, #32\n\t" + "ldm sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add sp, 
sp, #44\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr" + ); +} + +#endif /* Add b to a into r. (r = a + b) * * r A single precision integer. @@ -2691,12 +2727,11 @@ static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ */ static sp_digit sp_2048_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -2711,10 +2746,11 @@ static sp_digit sp_2048_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -2726,8 +2762,8 @@ static sp_digit sp_2048_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static sp_digit sp_2048_sub_in_place_16(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -2774,12 +2810,11 @@ static sp_digit sp_2048_sub_in_place_16(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_2048_add_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -2808,10 +2843,11 @@ static sp_digit sp_2048_add_16(sp_digit* r_p, const sp_digit* a_p, const sp_digi "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -2888,8 +2924,8 @@ SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, */ static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -2964,12 +3000,11 @@ static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_2048_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm 
("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -3026,10 +3061,11 @@ static sp_digit sp_2048_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digi "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -3110,8 +3146,8 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, */ static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -3242,12 +3278,11 @@ static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_2048_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -3360,10 +3395,11 @@ static sp_digit sp_2048_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digi "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -3437,6 +3473,7 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, (void)sp_2048_add_32(r + 96, r + 96, a1); } +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Square a and put result in r. (r = a * a) * * r A single precision integer. 
@@ -3444,14 +3481,13 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, */ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #32\n\t" /* A[0] * A[0] */ "ldr r10, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r9, r10, #16\n\t" "lsl r2, r10, #16\n\t" "lsr r2, r2, #16\n\t" @@ -3462,15 +3498,11 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r2, r2, #17\n\t" "adds r8, r8, r2\n\t" "adc r3, r3, r9\n\t" -#else - "umull r8, r3, r10, r10\n\t" -#endif "mov r4, #0\n\t" "str r8, [sp]\n\t" /* A[0] * A[1] */ "ldr r10, [%[a], #4]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3511,22 +3543,10 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" -#endif "str r3, [sp, #4]\n\t" /* A[0] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3567,20 +3587,8 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r4, r4, r8\n\t" "adcs r2, r2, r9\n\t" "adc r3, r3, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[1] * A[1] */ "ldr r10, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3600,17 +3608,10 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r4, r4, r8\n\t" "adcs r2, r2, r9\n\t" "adc r3, r3, #0\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [sp, #8]\n\t" /* A[0] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3651,21 +3652,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r4, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[1] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3705,20 +3694,10 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r4, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, #0\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, 
#0\n\t" -#endif "str r2, [sp, #12]\n\t" /* A[0] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3759,21 +3738,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" -#endif /* A[1] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3813,18 +3780,8 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" -#endif /* A[2] * A[2] */ "ldr r10, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3844,17 +3801,10 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" -#endif "str r3, [sp, #16]\n\t" /* A[0] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3876,15 +3826,11 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #16\n\t" "adds r5, r5, r8\n\t" "adc r6, r6, r9\n\t" -#else - "umull r5, r6, r10, r12\n\t" -#endif "mov r3, #0\n\t" "mov r7, #0\n\t" /* A[1] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3913,16 +3859,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[2] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3951,12 +3890,6 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif "adds r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adc r7, r7, r7\n\t" @@ -3967,7 +3900,6 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -3989,15 +3921,11 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #16\n\t" "adds r5, r5, r8\n\t" "adc r6, r6, r9\n\t" -#else - "umull r5, r6, r10, r12\n\t" -#endif "mov r4, #0\n\t" "mov 
r7, #0\n\t" /* A[1] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4026,16 +3954,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[2] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4064,15 +3985,8 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[3] * A[3] */ "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4095,15 +4009,6 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adc r7, r7, r7\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adc r7, r7, r7\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, #0\n\t" -#endif "adds r2, r2, r5\n\t" "adcs r3, r3, r6\n\t" "adc r4, r4, r7\n\t" @@ -4111,7 +4016,6 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4133,15 +4037,11 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #16\n\t" "adds r5, r5, r8\n\t" "adc r6, r6, r9\n\t" -#else - "umull r5, r6, r10, r12\n\t" -#endif "mov r2, #0\n\t" "mov r7, #0\n\t" /* A[1] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4170,16 +4070,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[2] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4208,16 +4101,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[3] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4246,12 +4132,6 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif "adds r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adc r7, r7, r7\n\t" @@ -4262,7 +4142,6 @@ static void 
sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4284,15 +4163,11 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #16\n\t" "adds r5, r5, r8\n\t" "adc r6, r6, r9\n\t" -#else - "umull r5, r6, r10, r12\n\t" -#endif "mov r3, #0\n\t" "mov r7, #0\n\t" /* A[2] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4321,16 +4196,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[3] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4359,15 +4227,8 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[4] * A[4] */ "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4390,15 +4251,6 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adc r7, r7, r7\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adc r7, r7, r7\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, #0\n\t" -#endif "adds r4, r4, r5\n\t" "adcs r2, r2, r6\n\t" "adc r3, r3, r7\n\t" @@ -4406,7 +4258,6 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4428,15 +4279,11 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #16\n\t" "adds r5, r5, r8\n\t" "adc r6, r6, r9\n\t" -#else - "umull r5, r6, r10, r12\n\t" -#endif "mov r4, #0\n\t" "mov r7, #0\n\t" /* A[3] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4465,16 +4312,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[4] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4503,12 +4343,6 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif "adds r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adc r7, r7, r7\n\t" @@ -4519,7 +4353,6 @@ static 
void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4560,21 +4393,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" -#endif /* A[4] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4614,18 +4435,8 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" -#endif /* A[5] * A[5] */ "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4645,17 +4456,10 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" -#endif "str r3, [%[r], #40]\n\t" /* A[4] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4696,21 +4500,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r4, r4, r8\n\t" "adcs r2, r2, r9\n\t" "adc r3, r3, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[5] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4750,20 +4542,10 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r4, r4, r8\n\t" "adcs r2, r2, r9\n\t" "adc r3, r3, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, #0\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [%[r], #44]\n\t" /* A[5] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4804,20 +4586,8 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r4, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[6] * A[6] */ "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ 
-4837,17 +4607,10 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r4, #0\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, #0\n\t" -#endif "str r2, [%[r], #48]\n\t" /* A[6] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4888,21 +4651,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" -#endif "str r3, [%[r], #52]\n\t" /* A[7] * A[7] */ "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -4920,11 +4671,6 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #17\n\t" "adds r4, r4, r8\n\t" "adc r2, r2, r9\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r4, r4, r8\n\t" - "adc r2, r2, r9\n\t" -#endif "str r4, [%[r], #56]\n\t" "str r2, [%[r], #60]\n\t" "ldm sp!, {r2, r3, r4, r8}\n\t" @@ -4937,6 +4683,366 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) ); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x44\n\t" + "str %[r], [sp, #64]\n\t" + "mov %[r], #0\n\t" + "ldr r12, [%[a]]\n\t" + /* A[0] * A[1] */ + "ldr lr, [%[a], #4]\n\t" + "umull r4, r5, r12, lr\n\t" + /* A[0] * A[3] */ + "ldr lr, [%[a], #12]\n\t" + "umull r6, r7, r12, lr\n\t" + /* A[0] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "umull r8, r9, r12, lr\n\t" + /* A[0] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "umull r10, r3, r12, lr\n\t" + /* A[0] * A[2] */ + "ldr lr, [%[a], #8]\n\t" + "mov r11, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[0] * A[4] */ + "ldr lr, [%[a], #16]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[0] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adcs r9, r9, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + "adcs r3, r3, #0\n\t" + "str r4, [sp, #4]\n\t" + "str r5, [sp, #8]\n\t" + /* A[1] * A[2] */ + "ldr r12, [%[a], #4]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "str r6, [sp, #12]\n\t" + "adds r7, r7, r11\n\t" + /* A[1] * A[3] */ + "ldr lr, [%[a], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "str r7, [sp, #16]\n\t" + "adds r8, r8, r11\n\t" + /* A[1] * A[4] */ + "ldr lr, [%[a], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[1] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[1] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[1] * A[7] */ + "ldr 
lr, [%[a], #28]\n\t" + "adc r4, %[r], #0\n\t" + "umlal r3, r4, r12, lr\n\t" + /* A[2] * A[3] */ + "ldr r12, [%[a], #8]\n\t" + "ldr lr, [%[a], #12]\n\t" + "mov r11, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "str r8, [sp, #20]\n\t" + "adds r9, r9, r11\n\t" + /* A[2] * A[4] */ + "ldr lr, [%[a], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "str r9, [sp, #24]\n\t" + "adds r10, r10, r11\n\t" + /* A[2] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[2] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[2] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r5, %[r], #0\n\t" + "umlal r4, r5, r12, lr\n\t" + /* A[3] * A[4] */ + "ldr r12, [%[a], #12]\n\t" + "ldr lr, [%[a], #16]\n\t" + "mov r11, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "str r10, [sp, #28]\n\t" + "adds r3, r3, r11\n\t" + /* A[3] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[3] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[3] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r6, %[r], #0\n\t" + "umlal r5, r6, r12, lr\n\t" + /* A[4] * A[5] */ + "ldr r12, [%[a], #16]\n\t" + "ldr lr, [%[a], #20]\n\t" + "mov r11, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[4] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[4] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r7, %[r], #0\n\t" + "umlal r6, r7, r12, lr\n\t" + /* A[5] * A[6] */ + "ldr r12, [%[a], #20]\n\t" + "ldr lr, [%[a], #24]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[5] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r8, %[r], #0\n\t" + "umlal r7, r8, r12, lr\n\t" + /* A[6] * A[7] */ + "ldr r12, [%[a], #24]\n\t" + "ldr lr, [%[a], #28]\n\t" + "mov r9, #0\n\t" + "umlal r8, r9, r12, lr\n\t" + "add lr, sp, #32\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "add lr, sp, #4\n\t" + "ldm lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "adds r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "stm lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "adcs r3, r3, r3\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, %[r], #0\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add lr, sp, #4\n\t" + "ldm lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "mov lr, sp\n\t" + /* A[0] * A[0] */ + "ldr r12, [%[a]]\n\t" + "umull r3, r11, r12, r12\n\t" + "adds r4, r4, r11\n\t" + /* A[1] * A[1] */ + "ldr r12, [%[a], #4]\n\t" + "adcs r5, r5, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, r12\n\t" + "adds r6, r6, r11\n\t" + /* A[2] * A[2] */ + "ldr r12, [%[a], #8]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, r12\n\t" + "adds r8, r8, r11\n\t" + /* A[3] * A[3] */ + "ldr r12, [%[a], #12]\n\t" + "adcs r9, r9, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, r12\n\t" + "adds r10, r10, r11\n\t" + "stm lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* A[4] * A[4] */ + "ldr r12, [%[a], #16]\n\t" + "adcs r3, 
r3, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, r12\n\t" + "adds r4, r4, r11\n\t" + /* A[5] * A[5] */ + "ldr r12, [%[a], #20]\n\t" + "adcs r5, r5, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, r12\n\t" + "adds r6, r6, r11\n\t" + /* A[6] * A[6] */ + "ldr r12, [%[a], #24]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, r12\n\t" + "adds r8, r8, r11\n\t" + /* A[7] * A[7] */ + "ldr r12, [%[a], #28]\n\t" + "adcs r9, r9, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r9, r10, r12, r12\n\t" + "ldr %[r], [sp, #64]\n\t" + "add %[r], %[r], #32\n\t" + "stm %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "sub %[r], %[r], #32\n\t" + "stm %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#else +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #32\n\t" + "str %[r], [sp, #28]\n\t" + "ldm %[a], {%[r], %[a], r2, r3, r4, r5, r6, r7}\n\t" + "umull r9, r10, %[r], %[r]\n\t" + "umull r11, r12, %[r], %[a]\n\t" + "adds r11, r11, r11\n\t" + "mov lr, #0\n\t" + "umaal r10, r11, lr, lr\n\t" + "stm sp, {r9, r10}\n\t" + "mov r8, lr\n\t" + "umaal r8, r12, %[r], r2\n\t" + "adcs r8, r8, r8\n\t" + "umaal r8, r11, %[a], %[a]\n\t" + "umull r9, r10, %[r], r3\n\t" + "umaal r9, r12, %[a], r2\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, lr, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [sp, #8]\n\t" + "str r9, [sp, #12]\n\t" +#else + "strd r8, r9, [sp, #8]\n\t" +#endif + "mov r9, lr\n\t" + "umaal r9, r10, %[r], r4\n\t" + "umaal r9, r12, %[a], r3\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, r2, r2\n\t" + "str r9, [sp, #16]\n\t" + "umull r9, r8, %[r], r5\n\t" + "umaal r9, r12, %[a], r4\n\t" + "umaal r9, r10, r2, r3\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, lr, lr\n\t" + "str r9, [sp, #20]\n\t" + "mov r9, lr\n\t" + "umaal r9, r8, %[r], r6\n\t" + "umaal r9, r12, %[a], r5\n\t" + "umaal r9, r10, r2, r4\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, r3, r3\n\t" + "str r9, [sp, #24]\n\t" + "umull %[r], r9, %[r], r7\n\t" + "umaal %[r], r8, %[a], r6\n\t" + "umaal %[r], r12, r2, r5\n\t" + "umaal %[r], r10, r3, r4\n\t" + "adcs %[r], %[r], %[r]\n\t" + "umaal %[r], r11, lr, lr\n\t" + /* R[7] = r0 */ + "umaal r9, r8, %[a], r7\n\t" + "umaal r9, r10, r2, r6\n\t" + "umaal r12, r9, r3, r5\n\t" + "adcs r12, r12, r12\n\t" + "umaal r12, r11, r4, r4\n\t" + /* R[8] = r12 */ + "umaal r9, r8, r2, r7\n\t" + "umaal r10, r9, r3, r6\n\t" + "mov r2, lr\n\t" + "umaal r10, r2, r4, r5\n\t" + "adcs r10, r10, r10\n\t" + "umaal r11, r10, lr, lr\n\t" + /* R[9] = r11 */ + "umaal r2, r8, r3, r7\n\t" + "umaal r2, r9, r4, r6\n\t" + "adcs r3, r2, r2\n\t" + "umaal r10, r3, r5, r5\n\t" + /* R[10] = r10 */ + "mov %[a], lr\n\t" + "umaal %[a], r8, r4, r7\n\t" + "umaal %[a], r9, r5, r6\n\t" + "adcs r4, %[a], %[a]\n\t" + "umaal r3, r4, lr, lr\n\t" + /* R[11] = r3 */ + "umaal r8, r9, r5, r7\n\t" + "adcs r8, r8, r8\n\t" + "umaal r4, r8, r6, r6\n\t" + /* R[12] = r4 */ + "mov r5, lr\n\t" + "umaal r5, r9, r6, r7\n\t" + "adcs r5, r5, r5\n\t" + "umaal r8, r5, lr, lr\n\t" + /* R[13] = r8 */ + "adcs r9, r9, r9\n\t" + 
"umaal r9, r5, r7, r7\n\t" + "adcs r7, r5, lr\n\t" + /* R[14] = r9 */ + /* R[15] = r7 */ + "ldr lr, [sp, #28]\n\t" + "add lr, lr, #28\n\t" + "stm lr!, {%[r], r12}\n\t" + "stm lr!, {r11}\n\t" + "stm lr!, {r10}\n\t" + "stm lr!, {r3, r4, r8, r9}\n\t" + "stm lr!, {r7}\n\t" + "sub lr, lr, #0x40\n\t" + "ldm sp, {%[r], %[a], r2, r3, r4, r5, r6}\n\t" + "stm lr, {%[r], %[a], r2, r3, r4, r5, r6}\n\t" + "add sp, sp, #32\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#endif /* Sub b from a into r. (r = a - b) * * r A single precision integer. @@ -4945,9 +5051,9 @@ static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) */ static sp_digit sp_2048_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -5016,9 +5122,9 @@ SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) */ static sp_digit sp_2048_sub_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -5101,9 +5207,9 @@ SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) */ static sp_digit sp_2048_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -5216,9 +5322,9 @@ SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) */ static sp_digit sp_2048_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -5254,16 +5360,15 @@ static sp_digit sp_2048_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" "mov r12, #0\n\t" "add lr, %[a], #0x100\n\t" "\n" "L_sp_2048_sub_in_pkace_64_word_%=: \n\t" - "subs r12, r10, r12\n\t" + "rsbs r12, r12, #0\n\t" "ldm %[a], {r2, r3, r4, r5}\n\t" "ldm %[b]!, {r6, r7, r8, r9}\n\t" "sbcs r2, r2, r6\n\t" @@ -5271,13 +5376,13 @@ static sp_digit 
sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p) "sbcs r4, r4, r8\n\t" "sbcs r5, r5, r9\n\t" "stm %[a]!, {r2, r3, r4, r5}\n\t" - "sbc r12, r10, r10\n\t" + "sbc r12, r12, r12\n\t" "cmp %[a], lr\n\t" "bne L_sp_2048_sub_in_pkace_64_word_%=\n\t" "mov %[a], r12\n\t" : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -5292,9 +5397,9 @@ static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p) */ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x200\n\t" @@ -5312,7 +5417,7 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "L_sp_2048_mul_64_inner_%=: \n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[b], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -5382,12 +5487,11 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b */ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x200\n\t" - "mov r12, #0\n\t" "mov r6, #0\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" @@ -5396,7 +5500,7 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "L_sp_2048_sqr_64_outer_%=: \n\t" "subs r3, r5, #0xfc\n\t" "it cc\n\t" - "movcc r3, r12\n\t" + "movcc r3, #0\n\t" "sub r4, r5, r3\n\t" "\n" "L_sp_2048_sqr_64_inner_%=: \n\t" @@ -5404,7 +5508,7 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "beq L_sp_2048_sqr_64_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -5457,7 +5561,7 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "\n" "L_sp_2048_sqr_64_op_sqr_%=: \n\t" "ldr lr, [%[a], r3]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsr r10, lr, #16\n\t" "lsr r9, r9, #16\n\t" @@ -5511,7 +5615,7 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "bgt L_sp_2048_sqr_64_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -5543,9 +5647,9 @@ static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) */ static sp_digit sp_2048_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = 
(sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -5581,16 +5685,15 @@ static sp_digit sp_2048_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" "mov r12, #0\n\t" "add lr, %[a], #0x80\n\t" "\n" "L_sp_2048_sub_in_pkace_32_word_%=: \n\t" - "subs r12, r10, r12\n\t" + "rsbs r12, r12, #0\n\t" "ldm %[a], {r2, r3, r4, r5}\n\t" "ldm %[b]!, {r6, r7, r8, r9}\n\t" "sbcs r2, r2, r6\n\t" @@ -5598,13 +5701,13 @@ static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) "sbcs r4, r4, r8\n\t" "sbcs r5, r5, r9\n\t" "stm %[a]!, {r2, r3, r4, r5}\n\t" - "sbc r12, r10, r10\n\t" + "sbc r12, r12, r12\n\t" "cmp %[a], lr\n\t" "bne L_sp_2048_sub_in_pkace_32_word_%=\n\t" "mov %[a], r12\n\t" : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -5619,9 +5722,9 @@ static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) */ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" @@ -5639,7 +5742,7 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "L_sp_2048_mul_32_inner_%=: \n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[b], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -5709,12 +5812,11 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b */ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r12, #0\n\t" "mov r6, #0\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" @@ -5723,7 +5825,7 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "L_sp_2048_sqr_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" "it cc\n\t" - "movcc r3, r12\n\t" + "movcc r3, #0\n\t" "sub r4, r5, r3\n\t" "\n" "L_sp_2048_sqr_32_inner_%=: \n\t" @@ -5731,7 +5833,7 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "beq L_sp_2048_sqr_32_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -5784,7 +5886,7 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "\n" "L_sp_2048_sqr_32_op_sqr_%=: \n\t" "ldr 
lr, [%[a], r3]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsr r10, lr, #16\n\t" "lsr r9, r9, #16\n\t" @@ -5838,7 +5940,7 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "bgt L_sp_2048_sqr_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -5874,15 +5976,14 @@ static void sp_2048_mont_setup(const sp_digit* a, sp_digit* rho) */ static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ "ldr r8, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r5, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -5915,7 +6016,7 @@ static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "L_sp_2048_mul_d_64_word_%=: \n\t" /* A[i] * B */ "ldr r8, [%[a], r9]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -5960,7 +6061,7 @@ static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "str r3, [%[r], #256]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -5973,15 +6074,14 @@ static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r3, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -6006,2491 +6106,75 @@ static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) #else "umull r3, r4, %[b], r8\n\t" #endif + "stm %[r]!, {r3}\n\t" "mov r5, #0\n\t" - "str r3, [%[r]], #4\n\t" /* A[1] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" "lsr r7, r7, #16\n\t" "mul r7, r6, r7\n\t" "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, #0\n\t" "lsr r7, r8, #16\n\t" "mul r6, r7, r6\n\t" "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, r7\n\t" "lsr r6, %[b], #16\n\t" "lsr r7, r8, #16\n\t" "mul r7, r6, 
r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "add r5, r5, r7\n\t" "lsl r7, r8, #16\n\t" "lsr r7, r7, #16\n\t" "mul r6, r7, r6\n\t" "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" + "umlal r4, r5, %[b], r8\n\t" #endif - "str r4, [%[r]], #4\n\t" + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" /* A[2] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" "lsr r7, r7, #16\n\t" "mul r7, r6, r7\n\t" "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" + "adc r3, r3, #0\n\t" "lsr r7, r8, #16\n\t" "mul r6, r7, r6\n\t" "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" + "adc r3, r3, r7\n\t" "lsr r6, %[b], #16\n\t" "lsr r7, r8, #16\n\t" "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" + "add r3, r3, r7\n\t" "lsl r7, r8, #16\n\t" "lsr r7, r7, #16\n\t" "mul r6, r7, r6\n\t" "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" + "adc r3, r3, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" + "umlal r5, r3, %[b], r8\n\t" #endif - "str r5, [%[r]], #4\n\t" + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" /* A[3] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[4] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" 
-#endif - "str r4, [%[r]], #4\n\t" - /* A[5] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[6] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[7] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[8] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, 
#16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[9] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[10] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[11] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[12] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - 
"lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[13] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[14] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[15] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[16] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[17] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[18] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[19] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, 
r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[20] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[21] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[22] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[23] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, 
#0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[24] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[25] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[26] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[27] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, 
#16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[28] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[29] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[30] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, 
r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[31] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[32] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[33] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[34] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, 
r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[35] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[36] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[37] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[38] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, 
#0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[39] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[40] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[41] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[42] * B */ - "ldr r8, 
[%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[43] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[44] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[45] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl 
r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[46] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[47] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[48] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[49] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, 
r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[50] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[51] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[52] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[53] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, 
#16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[54] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[55] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[56] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], 
r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[57] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[58] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[59] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[60] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, 
r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[61] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[62] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[63] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -8516,15 +6200,1933 @@ static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "adds r3, r3, r6\n\t" "adc r4, r4, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[4] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], 
r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[5] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[6] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" "adds r3, r3, r6\n\t" "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" #endif - "str r3, [%[r]], #4\n\t" + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[7] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[8] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[9] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + 
"lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[10] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[11] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[12] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[13] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[14] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, 
#16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[15] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[16] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[17] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[18] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], 
r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[19] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[20] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[21] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[22] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[23] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, 
#16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[24] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[25] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[26] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[27] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[28] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul 
r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[29] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[30] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[31] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[32] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + 
"stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[33] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[34] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[35] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[36] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[37] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, 
r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[38] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[39] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[40] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[41] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[42] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds 
r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[43] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[44] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[45] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[46] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + 
"mov r3, #0\n\t" + /* A[47] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[48] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[49] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[50] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[51] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, 
r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[52] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[53] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[54] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[55] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[56] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc 
r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[57] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[58] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[59] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[60] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* 
A[61] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[62] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[63] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" "str r4, [%[r]]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" ); } @@ -8555,10 +8157,10 @@ static void sp_2048_mont_norm_32(sp_digit* r, const sp_digit* m) */ static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r6, #0\n\t" @@ -8595,10 +8197,10 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp */ static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm 
("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -8723,6 +8325,7 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp } #endif /* WOLFSSL_SP_SMALL */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Reduce the number back to 2048 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -8731,12 +8334,12 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp */ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* m asm ("r1") = m_p; - register sp_digit mp asm ("r2") = mp_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "ldr r11, [%[m]]\n\t" #endif /* i = 0 */ @@ -8749,10 +8352,9 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ /* mu = a[i] * mp */ "mul r8, %[mp], r12\n\t" /* a[i+0] += m[0] * mu */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r11, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r7\n\t" @@ -8776,14 +8378,8 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r12, r12, r6\n\t" "adc r5, r5, r7\n\t" -#else - "umull r6, r7, r8, r11\n\t" - "adds r12, r12, r6\n\t" - "adc r5, r7, #0\n\t" -#endif /* a[i+1] += m[1] * mu */ "ldr r7, [%[m], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r10\n\t" @@ -8807,18 +8403,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r4, r4, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r4, r10, #0\n\t" -#endif "mov r12, lr\n\t" "adds r12, r12, r5\n\t" "adc r4, r4, #0\n\t" /* a[i+2] += m[2] * mu */ "ldr r7, [%[m], #8]\n\t" "ldr lr, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r10\n\t" @@ -8842,17 +8432,11 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r5, r5, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r5, r10, #0\n\t" -#endif "adds lr, lr, r4\n\t" "adc r5, r5, #0\n\t" /* a[i+3] += m[3] * mu */ "ldr r7, [%[m], #12]\n\t" "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -8876,18 +8460,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #12]\n\t" "adc r4, r4, #0\n\t" /* a[i+4] += m[4] * mu */ "ldr r7, [%[m], #16]\n\t" "ldr r10, [%[a], 
#16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -8911,18 +8489,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #16]\n\t" "adc r5, r5, #0\n\t" /* a[i+5] += m[5] * mu */ "ldr r7, [%[m], #20]\n\t" "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -8946,18 +8518,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #20]\n\t" "adc r4, r4, #0\n\t" /* a[i+6] += m[6] * mu */ "ldr r7, [%[m], #24]\n\t" "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -8981,18 +8547,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #24]\n\t" "adc r5, r5, #0\n\t" /* a[i+7] += m[7] * mu */ "ldr r7, [%[m], #28]\n\t" "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9016,18 +8576,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #28]\n\t" "adc r4, r4, #0\n\t" /* a[i+8] += m[8] * mu */ "ldr r7, [%[m], #32]\n\t" "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9051,18 +8605,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #32]\n\t" "adc r5, r5, #0\n\t" /* a[i+9] += m[9] * mu */ "ldr r7, [%[m], #36]\n\t" "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9086,18 +8634,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #36]\n\t" "adc r4, r4, #0\n\t" /* a[i+10] += m[10] * mu */ "ldr r7, [%[m], #40]\n\t" "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9121,18 +8663,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" 
"adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #40]\n\t" "adc r5, r5, #0\n\t" /* a[i+11] += m[11] * mu */ "ldr r7, [%[m], #44]\n\t" "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9156,18 +8692,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #44]\n\t" "adc r4, r4, #0\n\t" /* a[i+12] += m[12] * mu */ "ldr r7, [%[m], #48]\n\t" "ldr r10, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9191,18 +8721,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #48]\n\t" "adc r5, r5, #0\n\t" /* a[i+13] += m[13] * mu */ "ldr r7, [%[m], #52]\n\t" "ldr r10, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9226,18 +8750,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #52]\n\t" "adc r4, r4, #0\n\t" /* a[i+14] += m[14] * mu */ "ldr r7, [%[m], #56]\n\t" "ldr r10, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9261,18 +8779,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #56]\n\t" "adc r5, r5, #0\n\t" /* a[i+15] += m[15] * mu */ "ldr r7, [%[m], #60]\n\t" "ldr r10, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9296,18 +8808,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #60]\n\t" "adc r4, r4, #0\n\t" /* a[i+16] += m[16] * mu */ "ldr r7, [%[m], #64]\n\t" "ldr r10, [%[a], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9331,18 +8837,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #64]\n\t" "adc r5, r5, #0\n\t" /* a[i+17] += m[17] * mu */ "ldr r7, [%[m], 
#68]\n\t" "ldr r10, [%[a], #68]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9366,18 +8866,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #68]\n\t" "adc r4, r4, #0\n\t" /* a[i+18] += m[18] * mu */ "ldr r7, [%[m], #72]\n\t" "ldr r10, [%[a], #72]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9401,18 +8895,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #72]\n\t" "adc r5, r5, #0\n\t" /* a[i+19] += m[19] * mu */ "ldr r7, [%[m], #76]\n\t" "ldr r10, [%[a], #76]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9436,18 +8924,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #76]\n\t" "adc r4, r4, #0\n\t" /* a[i+20] += m[20] * mu */ "ldr r7, [%[m], #80]\n\t" "ldr r10, [%[a], #80]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9471,18 +8953,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #80]\n\t" "adc r5, r5, #0\n\t" /* a[i+21] += m[21] * mu */ "ldr r7, [%[m], #84]\n\t" "ldr r10, [%[a], #84]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9506,18 +8982,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #84]\n\t" "adc r4, r4, #0\n\t" /* a[i+22] += m[22] * mu */ "ldr r7, [%[m], #88]\n\t" "ldr r10, [%[a], #88]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9541,18 +9011,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #88]\n\t" "adc r5, r5, #0\n\t" /* a[i+23] += m[23] * mu */ "ldr r7, [%[m], #92]\n\t" "ldr r10, [%[a], #92]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9576,18 +9040,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const 
sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #92]\n\t" "adc r4, r4, #0\n\t" /* a[i+24] += m[24] * mu */ "ldr r7, [%[m], #96]\n\t" "ldr r10, [%[a], #96]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9611,18 +9069,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #96]\n\t" "adc r5, r5, #0\n\t" /* a[i+25] += m[25] * mu */ "ldr r7, [%[m], #100]\n\t" "ldr r10, [%[a], #100]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9646,18 +9098,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #100]\n\t" "adc r4, r4, #0\n\t" /* a[i+26] += m[26] * mu */ "ldr r7, [%[m], #104]\n\t" "ldr r10, [%[a], #104]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9681,18 +9127,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #104]\n\t" "adc r5, r5, #0\n\t" /* a[i+27] += m[27] * mu */ "ldr r7, [%[m], #108]\n\t" "ldr r10, [%[a], #108]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9716,18 +9156,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #108]\n\t" "adc r4, r4, #0\n\t" /* a[i+28] += m[28] * mu */ "ldr r7, [%[m], #112]\n\t" "ldr r10, [%[a], #112]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9751,18 +9185,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #112]\n\t" "adc r5, r5, #0\n\t" /* a[i+29] += m[29] * mu */ "ldr r7, [%[m], #116]\n\t" "ldr r10, [%[a], #116]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -9786,18 +9214,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #116]\n\t" "adc r4, r4, #0\n\t" 
/* a[i+30] += m[30] * mu */ "ldr r7, [%[m], #120]\n\t" "ldr r10, [%[a], #120]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -9821,22 +9243,16 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #120]\n\t" "adc r5, r5, #0\n\t" /* a[i+31] += m[31] * mu */ -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) - "ldr r7, [%[m], #124]\n\t" -#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m], #124]\n\t" +#else + "ldr r7, [%[m], #124]\n\t" #endif "ldr r10, [%[a], #124]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r11, #16\n\t" "lsr r6, r6, #16\n\t" @@ -9867,13 +9283,6 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "adds r5, r5, r6\n\t" "adcs r4, r4, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r7, r3\n\t" - "mov r3, #0\n\t" - "adc r3, r3, r3\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #124]\n\t" "ldr r10, [%[a], #128]\n\t" @@ -9885,6 +9294,7 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "add %[a], %[a], #4\n\t" "cmp r9, #0x80\n\t" "blt L_sp_2048_mont_reduce_32_word_%=\n\t" + /* Loop Done */ "str r12, [%[a]]\n\t" "str lr, [%[a], #4]\n\t" "mov %[mp], r3\n\t" @@ -9895,6 +9305,507 @@ static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Reduce the number back to 2048 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + "ldr r11, [%[m]]\n\t" + /* i = 0 */ + "mov r9, #0\n\t" + "mov r3, #0\n\t" + "ldr r12, [%[a]]\n\t" + "ldr lr, [%[a], #4]\n\t" + "\n" + "L_sp_2048_mont_reduce_32_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r8, %[mp], r12\n\t" + /* a[i+0] += m[0] * mu */ + "mov r5, #0\n\t" + "umlal r12, r5, r8, r11\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r7, [%[m], #4]\n\t" + "mov r4, #0\n\t" + "umlal lr, r4, r8, r7\n\t" + "mov r12, lr\n\t" + "adds r12, r12, r5\n\t" + "adc r4, r4, #0\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r7, [%[m], #8]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r5, #0\n\t" + "umlal lr, r5, r8, r7\n\t" + "adds lr, lr, r4\n\t" + "adc r5, r5, #0\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r7, [%[m], #12]\n\t" + "ldr r10, [%[a], #12]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #12]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r7, [%[m], #16]\n\t" + "ldr r10, [%[a], #16]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #16]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r7, [%[m], #20]\n\t" + "ldr r10, [%[a], #20]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #20]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r7, [%[m], #24]\n\t" + "ldr r10, [%[a], #24]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #24]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r7, [%[m], #28]\n\t" + "ldr r10, [%[a], #28]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #28]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r7, [%[m], #32]\n\t" + "ldr r10, [%[a], #32]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #32]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r7, [%[m], #36]\n\t" + "ldr r10, [%[a], #36]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #36]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r7, [%[m], #40]\n\t" + "ldr r10, [%[a], #40]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #40]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r7, [%[m], #44]\n\t" + "ldr r10, [%[a], #44]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #44]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r7, [%[m], #48]\n\t" + "ldr r10, [%[a], #48]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #48]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r7, [%[m], #52]\n\t" + "ldr r10, [%[a], #52]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #52]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r7, [%[m], #56]\n\t" + "ldr r10, [%[a], #56]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #56]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r7, 
[%[m], #60]\n\t" + "ldr r10, [%[a], #60]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #60]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r7, [%[m], #64]\n\t" + "ldr r10, [%[a], #64]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #64]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r7, [%[m], #68]\n\t" + "ldr r10, [%[a], #68]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #68]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r7, [%[m], #72]\n\t" + "ldr r10, [%[a], #72]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #72]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r7, [%[m], #76]\n\t" + "ldr r10, [%[a], #76]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #76]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r7, [%[m], #80]\n\t" + "ldr r10, [%[a], #80]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #80]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r7, [%[m], #84]\n\t" + "ldr r10, [%[a], #84]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #84]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r7, [%[m], #88]\n\t" + "ldr r10, [%[a], #88]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #88]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r7, [%[m], #92]\n\t" + "ldr r10, [%[a], #92]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #92]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r7, [%[m], #96]\n\t" + "ldr r10, [%[a], #96]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #96]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r7, [%[m], #100]\n\t" + "ldr r10, [%[a], #100]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #100]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r7, [%[m], #104]\n\t" + "ldr r10, [%[a], #104]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #104]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r7, [%[m], #108]\n\t" + "ldr r10, [%[a], #108]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #108]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r7, [%[m], #112]\n\t" + "ldr r10, [%[a], #112]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #112]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r7, [%[m], #116]\n\t" + "ldr r10, [%[a], #116]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #116]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r7, [%[m], #120]\n\t" + "ldr r10, [%[a], #120]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #120]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r7, [%[m], #124]\n\t" + "ldr r10, [%[a], #124]\n\t" + "umull r6, r7, r8, r7\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, 
r7, r3\n\t" + "mov r3, #0\n\t" + "adc r3, r3, r3\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #124]\n\t" + "ldr r10, [%[a], #128]\n\t" + "adcs r10, r10, r4\n\t" + "str r10, [%[a], #128]\n\t" + "adc r3, r3, #0\n\t" + /* i += 1 */ + "add r9, r9, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r9, #0x80\n\t" + "blt L_sp_2048_mont_reduce_32_word_%=\n\t" + /* Loop Done */ + "str r12, [%[a]]\n\t" + "str lr, [%[a], #4]\n\t" + "mov %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp); +} + +#else +/* Reduce the number back to 2048 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "\n" + "L_sp_2048_mont_reduce_32_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r11, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "ldr r10, [%[m]]\n\t" + "mov r3, #0\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r10, [%[m], #4]\n\t" + "mov r4, r5\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r10, [%[m], #8]\n\t" + "mov r5, r6\n\t" + "umaal r5, r3, r11, r10\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r10, [%[m], #12]\n\t" + "mov r6, r7\n\t" + "umaal r6, r3, r11, r10\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r10, [%[m], #16]\n\t" + "mov r7, r8\n\t" + "umaal r7, r3, r11, r10\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r10, [%[m], #20]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umaal r8, r3, r11, r10\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r10, [%[m], #24]\n\t" + "ldr r9, [%[a], #24]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r10, [%[m], #28]\n\t" + "ldr r9, [%[a], #28]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r10, [%[m], #32]\n\t" + "ldr r9, [%[a], #32]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r10, [%[m], #36]\n\t" + "ldr r9, [%[a], #36]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r10, [%[m], #40]\n\t" + "ldr r9, [%[a], #40]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r10, [%[m], #44]\n\t" + "ldr r9, [%[a], #44]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r10, [%[m], #48]\n\t" + "ldr r9, [%[a], #48]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r10, [%[m], #52]\n\t" + "ldr r9, [%[a], #52]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r10, [%[m], #56]\n\t" + "ldr r9, [%[a], #56]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r10, [%[m], #60]\n\t" + "ldr r9, [%[a], #60]\n\t" + "umaal r9, r3, r11, r10\n\t" + 
"str r9, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r10, [%[m], #64]\n\t" + "ldr r9, [%[a], #64]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r10, [%[m], #68]\n\t" + "ldr r9, [%[a], #68]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r10, [%[m], #72]\n\t" + "ldr r9, [%[a], #72]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r10, [%[m], #76]\n\t" + "ldr r9, [%[a], #76]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r10, [%[m], #80]\n\t" + "ldr r9, [%[a], #80]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r10, [%[m], #84]\n\t" + "ldr r9, [%[a], #84]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #84]\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r10, [%[m], #88]\n\t" + "ldr r9, [%[a], #88]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r10, [%[m], #92]\n\t" + "ldr r9, [%[a], #92]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r10, [%[m], #96]\n\t" + "ldr r9, [%[a], #96]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r10, [%[m], #100]\n\t" + "ldr r9, [%[a], #100]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r10, [%[m], #104]\n\t" + "ldr r9, [%[a], #104]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r10, [%[m], #108]\n\t" + "ldr r9, [%[a], #108]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r10, [%[m], #112]\n\t" + "ldr r9, [%[a], #112]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r10, [%[m], #116]\n\t" + "ldr r9, [%[a], #116]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r10, [%[m], #120]\n\t" + "ldr r9, [%[a], #120]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r10, [%[m], #124]\n\t" + "ldr r9, [%[a], #124]\n\t" + "umaal r9, r3, r11, r10\n\t" + "ldr r11, [%[a], #128]\n\t" + "mov r10, #0\n\t" + "umaal r3, r11, r10, r10\n\t" + "str r9, [%[a], #124]\n\t" + "adds r3, r3, lr\n\t" + "adc lr, r11, #0\n\t" + "str r3, [%[a], #128]\n\t" + /* i += 1 */ + "add r12, r12, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r12, #0x80\n\t" + "blt L_sp_2048_mont_reduce_32_word_%=\n\t" + /* Loop Done */ + "str r4, [%[a]]\n\t" + "str r5, [%[a], #4]\n\t" + "str r6, [%[a], #8]\n\t" + "str r7, [%[a], #12]\n\t" + "str r8, [%[a], #16]\n\t" + "mov %[mp], lr\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). 
* (r = a * b mod m) * @@ -9934,15 +9845,14 @@ SP_NOINLINE static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, */ static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ "ldr r8, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r5, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -9975,7 +9885,7 @@ static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "L_sp_2048_mul_d_32_word_%=: \n\t" /* A[i] * B */ "ldr r8, [%[a], r9]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -10020,7 +9930,7 @@ static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "str r3, [%[r], #128]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -10033,15 +9943,14 @@ static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r3, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -10066,1211 +9975,11 @@ static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) #else "umull r3, r4, %[b], r8\n\t" #endif + "stm %[r]!, {r3}\n\t" "mov r5, #0\n\t" - "str r3, [%[r]], #4\n\t" /* A[1] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[2] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, 
#16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[3] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[4] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[5] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" 
- "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[6] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[7] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[8] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[9] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - 
"adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[10] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[11] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[12] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[13] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, 
r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[14] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[15] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[16] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[17] * B */ - "ldr r8, 
[%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[18] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[19] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[20] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl 
r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[21] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[22] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[23] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[24] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, 
r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[25] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[26] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[27] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[28] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, 
#16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[29] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[30] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[31] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -11296,15 +10005,973 @@ static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[2] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, 
r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[3] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[4] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" #endif - "str r4, [%[r]], #4\n\t" + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[5] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[6] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[7] * B */ + "ldm %[a]!, {r8}\n\t" 
+#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[8] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[9] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[10] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[11] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + 
"mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[12] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[13] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[14] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[15] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[16] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, 
r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[17] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[18] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[19] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[20] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[21] * B */ + "ldm %[a]!, {r8}\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[22] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[23] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[24] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[25] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul 
r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[26] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[27] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[28] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[29] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[30] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" 
+ "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[31] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" "str r5, [%[r]]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" ); } @@ -11321,9 +10988,9 @@ static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr r6, %[div], #16\n\t" @@ -11380,9 +11047,9 @@ static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr lr, %[div], #1\n\t" @@ -11412,7 +11079,7 @@ static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "bpl L_div_2048_word_32_bit_%=\n\t" "add r3, r3, r3\n\t" "add r3, r3, #1\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -11440,7 +11107,7 @@ static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -11468,7 +11135,7 @@ static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -11516,8 +11183,8 @@ static sp_digit 
div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_int32 sp_2048_cmp_32(const sp_digit* a_p, const sp_digit* b_p) { - register const sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r2, #-1\n\t" @@ -12312,10 +11979,10 @@ static void sp_2048_mont_norm_64(sp_digit* r, const sp_digit* m) */ static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r6, #0\n\t" @@ -12352,10 +12019,10 @@ static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp */ static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -12592,6 +12259,7 @@ static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp } #endif /* WOLFSSL_SP_SMALL */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Reduce the number back to 2048 bits using Montgomery reduction. * * a A single precision number to reduce in place. 
@@ -12600,12 +12268,12 @@ static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp */ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* m asm ("r1") = m_p; - register sp_digit mp asm ("r2") = mp_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "ldr r11, [%[m]]\n\t" #endif /* i = 0 */ @@ -12618,10 +12286,9 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ /* mu = a[i] * mp */ "mul r8, %[mp], r12\n\t" /* a[i+0] += m[0] * mu */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r11, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r7\n\t" @@ -12645,14 +12312,8 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r12, r12, r6\n\t" "adc r5, r5, r7\n\t" -#else - "umull r6, r7, r8, r11\n\t" - "adds r12, r12, r6\n\t" - "adc r5, r7, #0\n\t" -#endif /* a[i+1] += m[1] * mu */ "ldr r7, [%[m], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r10\n\t" @@ -12676,18 +12337,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r4, r4, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r4, r10, #0\n\t" -#endif "mov r12, lr\n\t" "adds r12, r12, r5\n\t" "adc r4, r4, #0\n\t" /* a[i+2] += m[2] * mu */ "ldr r7, [%[m], #8]\n\t" "ldr lr, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r10\n\t" @@ -12711,17 +12366,11 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r5, r5, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r5, r10, #0\n\t" -#endif "adds lr, lr, r4\n\t" "adc r5, r5, #0\n\t" /* a[i+3] += m[3] * mu */ "ldr r7, [%[m], #12]\n\t" "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -12745,18 +12394,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #12]\n\t" "adc r4, r4, #0\n\t" /* a[i+4] += m[4] * mu */ "ldr r7, [%[m], #16]\n\t" "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -12780,18 +12423,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #16]\n\t" "adc r5, r5, #0\n\t" /* a[i+5] 
+= m[5] * mu */ "ldr r7, [%[m], #20]\n\t" "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -12815,18 +12452,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #20]\n\t" "adc r4, r4, #0\n\t" /* a[i+6] += m[6] * mu */ "ldr r7, [%[m], #24]\n\t" "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -12850,18 +12481,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #24]\n\t" "adc r5, r5, #0\n\t" /* a[i+7] += m[7] * mu */ "ldr r7, [%[m], #28]\n\t" "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -12885,18 +12510,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #28]\n\t" "adc r4, r4, #0\n\t" /* a[i+8] += m[8] * mu */ "ldr r7, [%[m], #32]\n\t" "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -12920,18 +12539,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #32]\n\t" "adc r5, r5, #0\n\t" /* a[i+9] += m[9] * mu */ "ldr r7, [%[m], #36]\n\t" "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -12955,18 +12568,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #36]\n\t" "adc r4, r4, #0\n\t" /* a[i+10] += m[10] * mu */ "ldr r7, [%[m], #40]\n\t" "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -12990,18 +12597,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #40]\n\t" "adc r5, r5, #0\n\t" /* a[i+11] += m[11] * mu */ "ldr r7, [%[m], #44]\n\t" "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13025,18 +12626,12 @@ static SP_NOINLINE void 
sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #44]\n\t" "adc r4, r4, #0\n\t" /* a[i+12] += m[12] * mu */ "ldr r7, [%[m], #48]\n\t" "ldr r10, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13060,18 +12655,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #48]\n\t" "adc r5, r5, #0\n\t" /* a[i+13] += m[13] * mu */ "ldr r7, [%[m], #52]\n\t" "ldr r10, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13095,18 +12684,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #52]\n\t" "adc r4, r4, #0\n\t" /* a[i+14] += m[14] * mu */ "ldr r7, [%[m], #56]\n\t" "ldr r10, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13130,18 +12713,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #56]\n\t" "adc r5, r5, #0\n\t" /* a[i+15] += m[15] * mu */ "ldr r7, [%[m], #60]\n\t" "ldr r10, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13165,18 +12742,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #60]\n\t" "adc r4, r4, #0\n\t" /* a[i+16] += m[16] * mu */ "ldr r7, [%[m], #64]\n\t" "ldr r10, [%[a], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13200,18 +12771,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #64]\n\t" "adc r5, r5, #0\n\t" /* a[i+17] += m[17] * mu */ "ldr r7, [%[m], #68]\n\t" "ldr r10, [%[a], #68]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13235,18 +12800,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str 
r10, [%[a], #68]\n\t" "adc r4, r4, #0\n\t" /* a[i+18] += m[18] * mu */ "ldr r7, [%[m], #72]\n\t" "ldr r10, [%[a], #72]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13270,18 +12829,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #72]\n\t" "adc r5, r5, #0\n\t" /* a[i+19] += m[19] * mu */ "ldr r7, [%[m], #76]\n\t" "ldr r10, [%[a], #76]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13305,18 +12858,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #76]\n\t" "adc r4, r4, #0\n\t" /* a[i+20] += m[20] * mu */ "ldr r7, [%[m], #80]\n\t" "ldr r10, [%[a], #80]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13340,18 +12887,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #80]\n\t" "adc r5, r5, #0\n\t" /* a[i+21] += m[21] * mu */ "ldr r7, [%[m], #84]\n\t" "ldr r10, [%[a], #84]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13375,18 +12916,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #84]\n\t" "adc r4, r4, #0\n\t" /* a[i+22] += m[22] * mu */ "ldr r7, [%[m], #88]\n\t" "ldr r10, [%[a], #88]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13410,18 +12945,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #88]\n\t" "adc r5, r5, #0\n\t" /* a[i+23] += m[23] * mu */ "ldr r7, [%[m], #92]\n\t" "ldr r10, [%[a], #92]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13445,18 +12974,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #92]\n\t" "adc r4, r4, #0\n\t" /* a[i+24] += m[24] * mu */ "ldr r7, [%[m], #96]\n\t" "ldr r10, [%[a], #96]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, 
r11\n\t" @@ -13480,18 +13003,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #96]\n\t" "adc r5, r5, #0\n\t" /* a[i+25] += m[25] * mu */ "ldr r7, [%[m], #100]\n\t" "ldr r10, [%[a], #100]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13515,18 +13032,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #100]\n\t" "adc r4, r4, #0\n\t" /* a[i+26] += m[26] * mu */ "ldr r7, [%[m], #104]\n\t" "ldr r10, [%[a], #104]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13550,18 +13061,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #104]\n\t" "adc r5, r5, #0\n\t" /* a[i+27] += m[27] * mu */ "ldr r7, [%[m], #108]\n\t" "ldr r10, [%[a], #108]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13585,18 +13090,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #108]\n\t" "adc r4, r4, #0\n\t" /* a[i+28] += m[28] * mu */ "ldr r7, [%[m], #112]\n\t" "ldr r10, [%[a], #112]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13620,18 +13119,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #112]\n\t" "adc r5, r5, #0\n\t" /* a[i+29] += m[29] * mu */ "ldr r7, [%[m], #116]\n\t" "ldr r10, [%[a], #116]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13655,18 +13148,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #116]\n\t" "adc r4, r4, #0\n\t" /* a[i+30] += m[30] * mu */ "ldr r7, [%[m], #120]\n\t" "ldr r10, [%[a], #120]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13690,18 +13177,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds 
r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #120]\n\t" "adc r5, r5, #0\n\t" /* a[i+31] += m[31] * mu */ "ldr r7, [%[m], #124]\n\t" "ldr r10, [%[a], #124]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13725,18 +13206,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #124]\n\t" "adc r4, r4, #0\n\t" /* a[i+32] += m[32] * mu */ "ldr r7, [%[m], #128]\n\t" "ldr r10, [%[a], #128]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13760,18 +13235,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #128]\n\t" "adc r5, r5, #0\n\t" /* a[i+33] += m[33] * mu */ "ldr r7, [%[m], #132]\n\t" "ldr r10, [%[a], #132]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13795,18 +13264,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #132]\n\t" "adc r4, r4, #0\n\t" /* a[i+34] += m[34] * mu */ "ldr r7, [%[m], #136]\n\t" "ldr r10, [%[a], #136]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13830,18 +13293,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #136]\n\t" "adc r5, r5, #0\n\t" /* a[i+35] += m[35] * mu */ "ldr r7, [%[m], #140]\n\t" "ldr r10, [%[a], #140]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13865,18 +13322,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #140]\n\t" "adc r4, r4, #0\n\t" /* a[i+36] += m[36] * mu */ "ldr r7, [%[m], #144]\n\t" "ldr r10, [%[a], #144]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13900,18 +13351,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #144]\n\t" "adc r5, r5, #0\n\t" /* a[i+37] += m[37] * mu */ "ldr r7, [%[m], #148]\n\t" "ldr r10, [%[a], #148]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -13935,18 +13380,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #148]\n\t" "adc r4, r4, #0\n\t" /* a[i+38] += m[38] * mu */ "ldr r7, [%[m], #152]\n\t" "ldr r10, [%[a], #152]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -13970,18 +13409,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #152]\n\t" "adc r5, r5, #0\n\t" /* a[i+39] += m[39] * mu */ "ldr r7, [%[m], #156]\n\t" "ldr r10, [%[a], #156]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14005,18 +13438,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #156]\n\t" "adc r4, r4, #0\n\t" /* a[i+40] += m[40] * mu */ "ldr r7, [%[m], #160]\n\t" "ldr r10, [%[a], #160]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14040,18 +13467,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #160]\n\t" "adc r5, r5, #0\n\t" /* a[i+41] += m[41] * mu */ "ldr r7, [%[m], #164]\n\t" "ldr r10, [%[a], #164]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14075,18 +13496,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #164]\n\t" "adc r4, r4, #0\n\t" /* a[i+42] += m[42] * mu */ "ldr r7, [%[m], #168]\n\t" "ldr r10, [%[a], #168]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14110,18 +13525,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #168]\n\t" "adc r5, r5, #0\n\t" /* a[i+43] += m[43] * mu */ "ldr r7, [%[m], #172]\n\t" "ldr r10, [%[a], #172]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14145,18 +13554,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* 
m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #172]\n\t" "adc r4, r4, #0\n\t" /* a[i+44] += m[44] * mu */ "ldr r7, [%[m], #176]\n\t" "ldr r10, [%[a], #176]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14180,18 +13583,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #176]\n\t" "adc r5, r5, #0\n\t" /* a[i+45] += m[45] * mu */ "ldr r7, [%[m], #180]\n\t" "ldr r10, [%[a], #180]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14215,18 +13612,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #180]\n\t" "adc r4, r4, #0\n\t" /* a[i+46] += m[46] * mu */ "ldr r7, [%[m], #184]\n\t" "ldr r10, [%[a], #184]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14250,18 +13641,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #184]\n\t" "adc r5, r5, #0\n\t" /* a[i+47] += m[47] * mu */ "ldr r7, [%[m], #188]\n\t" "ldr r10, [%[a], #188]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14285,18 +13670,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #188]\n\t" "adc r4, r4, #0\n\t" /* a[i+48] += m[48] * mu */ "ldr r7, [%[m], #192]\n\t" "ldr r10, [%[a], #192]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14320,18 +13699,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #192]\n\t" "adc r5, r5, #0\n\t" /* a[i+49] += m[49] * mu */ "ldr r7, [%[m], #196]\n\t" "ldr r10, [%[a], #196]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14355,18 +13728,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #196]\n\t" "adc r4, r4, 
#0\n\t" /* a[i+50] += m[50] * mu */ "ldr r7, [%[m], #200]\n\t" "ldr r10, [%[a], #200]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14390,18 +13757,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #200]\n\t" "adc r5, r5, #0\n\t" /* a[i+51] += m[51] * mu */ "ldr r7, [%[m], #204]\n\t" "ldr r10, [%[a], #204]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14425,18 +13786,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #204]\n\t" "adc r4, r4, #0\n\t" /* a[i+52] += m[52] * mu */ "ldr r7, [%[m], #208]\n\t" "ldr r10, [%[a], #208]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14460,18 +13815,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #208]\n\t" "adc r5, r5, #0\n\t" /* a[i+53] += m[53] * mu */ "ldr r7, [%[m], #212]\n\t" "ldr r10, [%[a], #212]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14495,18 +13844,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #212]\n\t" "adc r4, r4, #0\n\t" /* a[i+54] += m[54] * mu */ "ldr r7, [%[m], #216]\n\t" "ldr r10, [%[a], #216]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14530,18 +13873,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #216]\n\t" "adc r5, r5, #0\n\t" /* a[i+55] += m[55] * mu */ "ldr r7, [%[m], #220]\n\t" "ldr r10, [%[a], #220]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14565,18 +13902,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #220]\n\t" "adc r4, r4, #0\n\t" /* a[i+56] += m[56] * mu */ "ldr r7, [%[m], #224]\n\t" "ldr r10, [%[a], #224]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ 
-14600,18 +13931,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #224]\n\t" "adc r5, r5, #0\n\t" /* a[i+57] += m[57] * mu */ "ldr r7, [%[m], #228]\n\t" "ldr r10, [%[a], #228]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14635,18 +13960,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #228]\n\t" "adc r4, r4, #0\n\t" /* a[i+58] += m[58] * mu */ "ldr r7, [%[m], #232]\n\t" "ldr r10, [%[a], #232]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14670,18 +13989,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #232]\n\t" "adc r5, r5, #0\n\t" /* a[i+59] += m[59] * mu */ "ldr r7, [%[m], #236]\n\t" "ldr r10, [%[a], #236]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14705,18 +14018,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #236]\n\t" "adc r4, r4, #0\n\t" /* a[i+60] += m[60] * mu */ "ldr r7, [%[m], #240]\n\t" "ldr r10, [%[a], #240]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14740,18 +14047,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #240]\n\t" "adc r5, r5, #0\n\t" /* a[i+61] += m[61] * mu */ "ldr r7, [%[m], #244]\n\t" "ldr r10, [%[a], #244]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -14775,18 +14076,12 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #244]\n\t" "adc r4, r4, #0\n\t" /* a[i+62] += m[62] * mu */ "ldr r7, [%[m], #248]\n\t" "ldr r10, [%[a], #248]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -14810,22 +14105,16 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, 
r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #248]\n\t" "adc r5, r5, #0\n\t" /* a[i+63] += m[63] * mu */ -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) - "ldr r7, [%[m], #252]\n\t" -#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m], #252]\n\t" +#else + "ldr r7, [%[m], #252]\n\t" #endif "ldr r10, [%[a], #252]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r11, #16\n\t" "lsr r6, r6, #16\n\t" @@ -14856,13 +14145,6 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "adds r5, r5, r6\n\t" "adcs r4, r4, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r7, r3\n\t" - "mov r3, #0\n\t" - "adc r3, r3, r3\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #252]\n\t" "ldr r10, [%[a], #256]\n\t" @@ -14874,6 +14156,7 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ "add %[a], %[a], #4\n\t" "cmp r9, #0x100\n\t" "blt L_sp_2048_mont_reduce_64_word_%=\n\t" + /* Loop Done */ "str r12, [%[a]]\n\t" "str lr, [%[a], #4]\n\t" "mov %[mp], r3\n\t" @@ -14884,6 +14167,923 @@ static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_ sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Reduce the number back to 2048 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + "ldr r11, [%[m]]\n\t" + /* i = 0 */ + "mov r9, #0\n\t" + "mov r3, #0\n\t" + "ldr r12, [%[a]]\n\t" + "ldr lr, [%[a], #4]\n\t" + "\n" + "L_sp_2048_mont_reduce_64_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r8, %[mp], r12\n\t" + /* a[i+0] += m[0] * mu */ + "mov r5, #0\n\t" + "umlal r12, r5, r8, r11\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r7, [%[m], #4]\n\t" + "mov r4, #0\n\t" + "umlal lr, r4, r8, r7\n\t" + "mov r12, lr\n\t" + "adds r12, r12, r5\n\t" + "adc r4, r4, #0\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r7, [%[m], #8]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r5, #0\n\t" + "umlal lr, r5, r8, r7\n\t" + "adds lr, lr, r4\n\t" + "adc r5, r5, #0\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r7, [%[m], #12]\n\t" + "ldr r10, [%[a], #12]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #12]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r7, [%[m], #16]\n\t" + "ldr r10, [%[a], #16]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #16]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r7, [%[m], #20]\n\t" + "ldr r10, [%[a], #20]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #20]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r7, [%[m], #24]\n\t" + "ldr r10, [%[a], #24]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #24]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r7, [%[m], #28]\n\t" + "ldr r10, [%[a], #28]\n\t" + 
"mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #28]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r7, [%[m], #32]\n\t" + "ldr r10, [%[a], #32]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #32]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r7, [%[m], #36]\n\t" + "ldr r10, [%[a], #36]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #36]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r7, [%[m], #40]\n\t" + "ldr r10, [%[a], #40]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #40]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r7, [%[m], #44]\n\t" + "ldr r10, [%[a], #44]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #44]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r7, [%[m], #48]\n\t" + "ldr r10, [%[a], #48]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #48]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r7, [%[m], #52]\n\t" + "ldr r10, [%[a], #52]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #52]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r7, [%[m], #56]\n\t" + "ldr r10, [%[a], #56]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #56]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r7, [%[m], #60]\n\t" + "ldr r10, [%[a], #60]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #60]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r7, [%[m], #64]\n\t" + "ldr r10, [%[a], #64]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #64]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r7, [%[m], #68]\n\t" + "ldr r10, [%[a], #68]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #68]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r7, [%[m], #72]\n\t" + "ldr r10, [%[a], #72]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #72]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r7, [%[m], #76]\n\t" + "ldr r10, [%[a], #76]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #76]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r7, [%[m], #80]\n\t" + "ldr r10, [%[a], #80]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #80]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r7, [%[m], #84]\n\t" + "ldr r10, [%[a], #84]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #84]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r7, [%[m], #88]\n\t" + "ldr r10, [%[a], #88]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #88]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r7, [%[m], #92]\n\t" + "ldr r10, [%[a], #92]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #92]\n\t" + "adc r4, r4, #0\n\t" + /* 
a[i+24] += m[24] * mu */ + "ldr r7, [%[m], #96]\n\t" + "ldr r10, [%[a], #96]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #96]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r7, [%[m], #100]\n\t" + "ldr r10, [%[a], #100]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #100]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r7, [%[m], #104]\n\t" + "ldr r10, [%[a], #104]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #104]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r7, [%[m], #108]\n\t" + "ldr r10, [%[a], #108]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #108]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r7, [%[m], #112]\n\t" + "ldr r10, [%[a], #112]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #112]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r7, [%[m], #116]\n\t" + "ldr r10, [%[a], #116]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #116]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r7, [%[m], #120]\n\t" + "ldr r10, [%[a], #120]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #120]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r7, [%[m], #124]\n\t" + "ldr r10, [%[a], #124]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #124]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+32] += m[32] * mu */ + "ldr r7, [%[m], #128]\n\t" + "ldr r10, [%[a], #128]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #128]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+33] += m[33] * mu */ + "ldr r7, [%[m], #132]\n\t" + "ldr r10, [%[a], #132]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #132]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+34] += m[34] * mu */ + "ldr r7, [%[m], #136]\n\t" + "ldr r10, [%[a], #136]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #136]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+35] += m[35] * mu */ + "ldr r7, [%[m], #140]\n\t" + "ldr r10, [%[a], #140]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #140]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+36] += m[36] * mu */ + "ldr r7, [%[m], #144]\n\t" + "ldr r10, [%[a], #144]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #144]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+37] += m[37] * mu */ + "ldr r7, [%[m], #148]\n\t" + "ldr r10, [%[a], #148]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #148]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+38] += m[38] * mu */ + "ldr r7, [%[m], #152]\n\t" + "ldr r10, [%[a], #152]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #152]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+39] += m[39] * mu */ + "ldr r7, [%[m], #156]\n\t" + "ldr r10, [%[a], #156]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #156]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+40] += m[40] * mu */ + "ldr r7, [%[m], #160]\n\t" + "ldr r10, [%[a], #160]\n\t" + 
"mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #160]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+41] += m[41] * mu */ + "ldr r7, [%[m], #164]\n\t" + "ldr r10, [%[a], #164]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #164]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+42] += m[42] * mu */ + "ldr r7, [%[m], #168]\n\t" + "ldr r10, [%[a], #168]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #168]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+43] += m[43] * mu */ + "ldr r7, [%[m], #172]\n\t" + "ldr r10, [%[a], #172]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #172]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+44] += m[44] * mu */ + "ldr r7, [%[m], #176]\n\t" + "ldr r10, [%[a], #176]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #176]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+45] += m[45] * mu */ + "ldr r7, [%[m], #180]\n\t" + "ldr r10, [%[a], #180]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #180]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+46] += m[46] * mu */ + "ldr r7, [%[m], #184]\n\t" + "ldr r10, [%[a], #184]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #184]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+47] += m[47] * mu */ + "ldr r7, [%[m], #188]\n\t" + "ldr r10, [%[a], #188]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #188]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+48] += m[48] * mu */ + "ldr r7, [%[m], #192]\n\t" + "ldr r10, [%[a], #192]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #192]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+49] += m[49] * mu */ + "ldr r7, [%[m], #196]\n\t" + "ldr r10, [%[a], #196]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #196]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+50] += m[50] * mu */ + "ldr r7, [%[m], #200]\n\t" + "ldr r10, [%[a], #200]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #200]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+51] += m[51] * mu */ + "ldr r7, [%[m], #204]\n\t" + "ldr r10, [%[a], #204]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #204]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+52] += m[52] * mu */ + "ldr r7, [%[m], #208]\n\t" + "ldr r10, [%[a], #208]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #208]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+53] += m[53] * mu */ + "ldr r7, [%[m], #212]\n\t" + "ldr r10, [%[a], #212]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #212]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+54] += m[54] * mu */ + "ldr r7, [%[m], #216]\n\t" + "ldr r10, [%[a], #216]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #216]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+55] += m[55] * mu */ + "ldr r7, [%[m], #220]\n\t" + "ldr r10, [%[a], #220]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #220]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+56] += m[56] * mu */ + "ldr r7, [%[m], #224]\n\t" + "ldr r10, [%[a], #224]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str 
r10, [%[a], #224]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+57] += m[57] * mu */ + "ldr r7, [%[m], #228]\n\t" + "ldr r10, [%[a], #228]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #228]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+58] += m[58] * mu */ + "ldr r7, [%[m], #232]\n\t" + "ldr r10, [%[a], #232]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #232]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+59] += m[59] * mu */ + "ldr r7, [%[m], #236]\n\t" + "ldr r10, [%[a], #236]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #236]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+60] += m[60] * mu */ + "ldr r7, [%[m], #240]\n\t" + "ldr r10, [%[a], #240]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #240]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+61] += m[61] * mu */ + "ldr r7, [%[m], #244]\n\t" + "ldr r10, [%[a], #244]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #244]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+62] += m[62] * mu */ + "ldr r7, [%[m], #248]\n\t" + "ldr r10, [%[a], #248]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #248]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+63] += m[63] * mu */ + "ldr r7, [%[m], #252]\n\t" + "ldr r10, [%[a], #252]\n\t" + "umull r6, r7, r8, r7\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r7, r3\n\t" + "mov r3, #0\n\t" + "adc r3, r3, r3\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #252]\n\t" + "ldr r10, [%[a], #256]\n\t" + "adcs r10, r10, r4\n\t" + "str r10, [%[a], #256]\n\t" + "adc r3, r3, #0\n\t" + /* i += 1 */ + "add r9, r9, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r9, #0x100\n\t" + "blt L_sp_2048_mont_reduce_64_word_%=\n\t" + /* Loop Done */ + "str r12, [%[a]]\n\t" + "str lr, [%[a], #4]\n\t" + "mov %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp); +} + +#else +/* Reduce the number back to 2048 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "\n" + "L_sp_2048_mont_reduce_64_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r11, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "ldr r10, [%[m]]\n\t" + "mov r3, #0\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r10, [%[m], #4]\n\t" + "mov r4, r5\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r10, [%[m], #8]\n\t" + "mov r5, r6\n\t" + "umaal r5, r3, r11, r10\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r10, [%[m], #12]\n\t" + "mov r6, r7\n\t" + "umaal r6, r3, r11, r10\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r10, [%[m], #16]\n\t" + "mov r7, r8\n\t" + "umaal r7, r3, r11, r10\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r10, [%[m], #20]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umaal r8, r3, r11, r10\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r10, [%[m], #24]\n\t" + "ldr r9, [%[a], #24]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r10, [%[m], #28]\n\t" + "ldr r9, [%[a], #28]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r10, [%[m], #32]\n\t" + "ldr r9, [%[a], #32]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r10, [%[m], #36]\n\t" + "ldr r9, [%[a], #36]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r10, [%[m], #40]\n\t" + "ldr r9, [%[a], #40]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r10, [%[m], #44]\n\t" + "ldr r9, [%[a], #44]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r10, [%[m], #48]\n\t" + "ldr r9, [%[a], #48]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r10, [%[m], #52]\n\t" + "ldr r9, [%[a], #52]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r10, [%[m], #56]\n\t" + "ldr r9, [%[a], #56]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r10, [%[m], #60]\n\t" + "ldr r9, [%[a], #60]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r10, [%[m], #64]\n\t" + "ldr r9, [%[a], #64]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r10, [%[m], #68]\n\t" + "ldr r9, [%[a], #68]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r10, [%[m], #72]\n\t" + "ldr r9, [%[a], #72]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r10, [%[m], #76]\n\t" + "ldr r9, [%[a], #76]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r10, [%[m], #80]\n\t" + "ldr r9, [%[a], #80]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r10, [%[m], #84]\n\t" + "ldr r9, [%[a], #84]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], 
#84]\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r10, [%[m], #88]\n\t" + "ldr r9, [%[a], #88]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r10, [%[m], #92]\n\t" + "ldr r9, [%[a], #92]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r10, [%[m], #96]\n\t" + "ldr r9, [%[a], #96]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r10, [%[m], #100]\n\t" + "ldr r9, [%[a], #100]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r10, [%[m], #104]\n\t" + "ldr r9, [%[a], #104]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r10, [%[m], #108]\n\t" + "ldr r9, [%[a], #108]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r10, [%[m], #112]\n\t" + "ldr r9, [%[a], #112]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r10, [%[m], #116]\n\t" + "ldr r9, [%[a], #116]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r10, [%[m], #120]\n\t" + "ldr r9, [%[a], #120]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r10, [%[m], #124]\n\t" + "ldr r9, [%[a], #124]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #124]\n\t" + /* a[i+32] += m[32] * mu */ + "ldr r10, [%[m], #128]\n\t" + "ldr r9, [%[a], #128]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #128]\n\t" + /* a[i+33] += m[33] * mu */ + "ldr r10, [%[m], #132]\n\t" + "ldr r9, [%[a], #132]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #132]\n\t" + /* a[i+34] += m[34] * mu */ + "ldr r10, [%[m], #136]\n\t" + "ldr r9, [%[a], #136]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #136]\n\t" + /* a[i+35] += m[35] * mu */ + "ldr r10, [%[m], #140]\n\t" + "ldr r9, [%[a], #140]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #140]\n\t" + /* a[i+36] += m[36] * mu */ + "ldr r10, [%[m], #144]\n\t" + "ldr r9, [%[a], #144]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #144]\n\t" + /* a[i+37] += m[37] * mu */ + "ldr r10, [%[m], #148]\n\t" + "ldr r9, [%[a], #148]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #148]\n\t" + /* a[i+38] += m[38] * mu */ + "ldr r10, [%[m], #152]\n\t" + "ldr r9, [%[a], #152]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #152]\n\t" + /* a[i+39] += m[39] * mu */ + "ldr r10, [%[m], #156]\n\t" + "ldr r9, [%[a], #156]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #156]\n\t" + /* a[i+40] += m[40] * mu */ + "ldr r10, [%[m], #160]\n\t" + "ldr r9, [%[a], #160]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #160]\n\t" + /* a[i+41] += m[41] * mu */ + "ldr r10, [%[m], #164]\n\t" + "ldr r9, [%[a], #164]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #164]\n\t" + /* a[i+42] += m[42] * mu */ + "ldr r10, [%[m], #168]\n\t" + "ldr r9, [%[a], #168]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #168]\n\t" + /* a[i+43] += m[43] * mu */ + "ldr r10, [%[m], #172]\n\t" + "ldr r9, [%[a], #172]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #172]\n\t" + /* a[i+44] += m[44] * mu */ + "ldr r10, [%[m], #176]\n\t" + "ldr r9, [%[a], #176]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #176]\n\t" + /* a[i+45] += m[45] * mu */ + "ldr r10, [%[m], #180]\n\t" + "ldr r9, [%[a], #180]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str 
r9, [%[a], #180]\n\t" + /* a[i+46] += m[46] * mu */ + "ldr r10, [%[m], #184]\n\t" + "ldr r9, [%[a], #184]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #184]\n\t" + /* a[i+47] += m[47] * mu */ + "ldr r10, [%[m], #188]\n\t" + "ldr r9, [%[a], #188]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #188]\n\t" + /* a[i+48] += m[48] * mu */ + "ldr r10, [%[m], #192]\n\t" + "ldr r9, [%[a], #192]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #192]\n\t" + /* a[i+49] += m[49] * mu */ + "ldr r10, [%[m], #196]\n\t" + "ldr r9, [%[a], #196]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #196]\n\t" + /* a[i+50] += m[50] * mu */ + "ldr r10, [%[m], #200]\n\t" + "ldr r9, [%[a], #200]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #200]\n\t" + /* a[i+51] += m[51] * mu */ + "ldr r10, [%[m], #204]\n\t" + "ldr r9, [%[a], #204]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #204]\n\t" + /* a[i+52] += m[52] * mu */ + "ldr r10, [%[m], #208]\n\t" + "ldr r9, [%[a], #208]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #208]\n\t" + /* a[i+53] += m[53] * mu */ + "ldr r10, [%[m], #212]\n\t" + "ldr r9, [%[a], #212]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #212]\n\t" + /* a[i+54] += m[54] * mu */ + "ldr r10, [%[m], #216]\n\t" + "ldr r9, [%[a], #216]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #216]\n\t" + /* a[i+55] += m[55] * mu */ + "ldr r10, [%[m], #220]\n\t" + "ldr r9, [%[a], #220]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #220]\n\t" + /* a[i+56] += m[56] * mu */ + "ldr r10, [%[m], #224]\n\t" + "ldr r9, [%[a], #224]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #224]\n\t" + /* a[i+57] += m[57] * mu */ + "ldr r10, [%[m], #228]\n\t" + "ldr r9, [%[a], #228]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #228]\n\t" + /* a[i+58] += m[58] * mu */ + "ldr r10, [%[m], #232]\n\t" + "ldr r9, [%[a], #232]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #232]\n\t" + /* a[i+59] += m[59] * mu */ + "ldr r10, [%[m], #236]\n\t" + "ldr r9, [%[a], #236]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #236]\n\t" + /* a[i+60] += m[60] * mu */ + "ldr r10, [%[m], #240]\n\t" + "ldr r9, [%[a], #240]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #240]\n\t" + /* a[i+61] += m[61] * mu */ + "ldr r10, [%[m], #244]\n\t" + "ldr r9, [%[a], #244]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #244]\n\t" + /* a[i+62] += m[62] * mu */ + "ldr r10, [%[m], #248]\n\t" + "ldr r9, [%[a], #248]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #248]\n\t" + /* a[i+63] += m[63] * mu */ + "ldr r10, [%[m], #252]\n\t" + "ldr r9, [%[a], #252]\n\t" + "umaal r9, r3, r11, r10\n\t" + "ldr r11, [%[a], #256]\n\t" + "mov r10, #0\n\t" + "umaal r3, r11, r10, r10\n\t" + "str r9, [%[a], #252]\n\t" + "adds r3, r3, lr\n\t" + "adc lr, r11, #0\n\t" + "str r3, [%[a], #256]\n\t" + /* i += 1 */ + "add r12, r12, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r12, #0x100\n\t" + "blt L_sp_2048_mont_reduce_64_word_%=\n\t" + /* Loop Done */ + "str r4, [%[a]]\n\t" + "str r5, [%[a], #4]\n\t" + "str r6, [%[a], #8]\n\t" + "str r7, [%[a], #12]\n\t" + "str r8, [%[a], #16]\n\t" + "mov %[mp], lr\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). 
* (r = a * b mod m) * @@ -14923,9 +15123,9 @@ SP_NOINLINE static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, */ static sp_digit sp_2048_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r12, #0\n\t" @@ -14960,9 +15160,9 @@ static sp_digit sp_2048_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit sp_2048_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -15098,9 +15298,9 @@ static sp_digit sp_2048_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr r6, %[div], #16\n\t" @@ -15157,9 +15357,9 @@ static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr lr, %[div], #1\n\t" @@ -15189,7 +15389,7 @@ static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "bpl L_div_2048_word_64_bit_%=\n\t" "add r3, r3, r3\n\t" "add r3, r3, #1\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -15217,7 +15417,7 @@ static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -15245,7 +15445,7 @@ static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -15396,8 +15596,8 @@ static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m) */ static sp_int32 sp_2048_cmp_64(const sp_digit* a_p, const sp_digit* b_p) { - register const sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + 
register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r2, #-1\n\t" @@ -16667,10 +16867,10 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, */ static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -16707,10 +16907,10 @@ static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp */ static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r8, #0\n\t" @@ -17151,9 +17351,9 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef HAVE_FFDHE_2048 static void sp_2048_lshift_64(sp_digit* r_p, const sp_digit* a_p, byte n_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register byte n asm ("r2") = n_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; __asm__ __volatile__ ( "rsb r12, %[n], #31\n\t" @@ -17845,14 +18045,14 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -17965,17 +18165,16 @@ static void sp_3072_to_bin_96(sp_digit* r, byte* a) */ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #48\n\t" - "mov r10, #0\n\t" /* A[0] * B[0] */ "ldr r11, [%[a]]\n\t" "ldr r12, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r3, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18005,7 +18204,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [sp]\n\t" /* A[0] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18044,7 +18243,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[1] * B[0] */ "ldr r8, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18082,7 +18281,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [sp, #4]\n\t" /* A[2] * B[0] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18122,7 +18321,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[1] */ "ldr r11, [%[a], #4]\n\t" "ldr r12, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18160,7 +18359,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[2] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18198,7 +18397,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [sp, #8]\n\t" /* A[0] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18237,7 +18436,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[1] * B[2] */ "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18274,7 +18473,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[2] * B[1] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18312,7 +18511,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[0] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18350,7 +18549,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [sp, #12]\n\t" /* A[4] * B[0] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18389,7 +18588,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[3] * B[1] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18427,7 +18626,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[2] */ "ldr r11, [%[a], #8]\n\t" "ldr r12, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18465,7 +18664,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[3] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18503,7 +18702,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[4] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18541,7 +18740,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [sp, #16]\n\t" /* A[0] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18581,7 +18780,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[4] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18618,7 +18817,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[2] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18655,7 +18854,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[3] * B[2] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18693,7 +18892,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[1] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18731,7 +18930,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[0] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18769,7 +18968,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [sp, #20]\n\t" /* A[6] * B[0] */ "ldr r8, [%[a], #24]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18809,7 +19008,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[1] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18846,7 +19045,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[4] * B[2] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18884,7 +19083,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[3] */ "ldr r11, [%[a], #12]\n\t" "ldr r12, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18922,7 +19121,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[4] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18960,7 +19159,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[5] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -18998,7 +19197,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[6] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19036,7 +19235,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [sp, #24]\n\t" /* A[0] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19076,7 +19275,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[6] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19114,7 +19313,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[5] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19151,7 +19350,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[3] * B[4] */ "ldr r9, [%[b], 
#16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19188,7 +19387,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[4] * B[3] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19226,7 +19425,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[2] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19264,7 +19463,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[1] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19302,7 +19501,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[0] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19340,7 +19539,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [sp, #28]\n\t" /* A[8] * B[0] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19380,7 +19579,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[1] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19418,7 +19617,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[2] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19455,7 +19654,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[5] * B[3] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19493,7 +19692,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[4] */ "ldr r11, [%[a], #16]\n\t" "ldr r12, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19531,7 +19730,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[5] */ "ldr r8, [%[a], #12]\n\t" 
"ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19569,7 +19768,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[6] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19607,7 +19806,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[7] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19645,7 +19844,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[8] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19683,7 +19882,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [sp, #32]\n\t" /* A[0] * B[9] */ "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19723,7 +19922,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[8] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19761,7 +19960,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[7] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19799,7 +19998,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[6] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19836,7 +20035,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[4] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19873,7 +20072,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[5] * B[4] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19911,7 +20110,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[3] */ "ldr r8, [%[a], 
#24]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19949,7 +20148,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[2] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -19987,7 +20186,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[1] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20025,7 +20224,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[0] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20063,7 +20262,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [sp, #36]\n\t" /* A[10] * B[0] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20103,7 +20302,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[1] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20141,7 +20340,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[2] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20179,7 +20378,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[3] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20216,7 +20415,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[6] * B[4] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20254,7 +20453,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[5] */ "ldr r11, [%[a], #20]\n\t" "ldr r12, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20292,7 +20491,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* 
A[4] * B[6] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20330,7 +20529,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[7] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20368,7 +20567,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[8] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20406,7 +20605,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[9] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20444,7 +20643,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[10] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20482,7 +20681,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [sp, #40]\n\t" /* A[0] * B[11] */ "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20522,7 +20721,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[10] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20560,7 +20759,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[9] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20598,7 +20797,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[8] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20636,7 +20835,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[7] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20673,7 +20872,7 @@ static void sp_3072_mul_12(sp_digit* 
r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[5] * B[6] */ "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20710,7 +20909,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[6] * B[5] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20748,7 +20947,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[4] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20786,7 +20985,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[3] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20824,7 +21023,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[2] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20862,7 +21061,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[1] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20900,7 +21099,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[0] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20938,7 +21137,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [sp, #44]\n\t" /* A[11] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -20978,7 +21177,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[2] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21016,7 +21215,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[3] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21054,7 +21253,7 @@ static void 
sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[4] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21091,7 +21290,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[7] * B[5] */ "ldr r8, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21129,7 +21328,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[6] */ "ldr r11, [%[a], #24]\n\t" "ldr r12, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21167,7 +21366,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[7] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21205,7 +21404,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[8] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21243,7 +21442,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[9] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21281,7 +21480,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[10] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21319,7 +21518,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[11] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21357,7 +21556,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [%[r], #48]\n\t" /* A[2] * B[11] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21397,7 +21596,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[10] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, 
#16\n\t" @@ -21435,7 +21634,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[9] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21473,7 +21672,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[8] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21510,7 +21709,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[6] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21547,7 +21746,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[7] * B[6] */ "ldr r8, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21585,7 +21784,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[5] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21623,7 +21822,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[4] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21661,7 +21860,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[3] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21699,7 +21898,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[2] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21737,7 +21936,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [%[r], #52]\n\t" /* A[11] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21777,7 +21976,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[4] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, 
#16\n\t" "lsr r6, r6, #16\n\t" @@ -21815,7 +22014,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[5] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21852,7 +22051,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[8] * B[6] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21890,7 +22089,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[7] */ "ldr r11, [%[a], #28]\n\t" "ldr r12, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21928,7 +22127,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[8] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -21966,7 +22165,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[9] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22004,7 +22203,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[10] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22042,7 +22241,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[11] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22080,7 +22279,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [%[r], #56]\n\t" /* A[4] * B[11] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22120,7 +22319,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[10] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22158,7 +22357,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[9] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22195,7 +22394,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[7] * B[8] */ "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22232,7 +22431,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[8] * B[7] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22270,7 +22469,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[6] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22308,7 +22507,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[5] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22346,7 +22545,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[4] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22384,7 +22583,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [%[r], #60]\n\t" /* A[11] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22424,7 +22623,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[6] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22461,7 +22660,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[9] * B[7] */ "ldr r8, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22499,7 +22698,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[8] */ "ldr r11, [%[a], #32]\n\t" "ldr r12, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22537,7 +22736,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[9] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22575,7 +22774,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[10] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22613,7 +22812,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[11] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22651,7 +22850,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [%[r], #64]\n\t" /* A[6] * B[11] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22691,7 +22890,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[10] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22728,7 +22927,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[8] * B[9] */ "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22765,7 +22964,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[9] * B[8] */ "ldr r8, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22803,7 +23002,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[7] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22841,7 +23040,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[6] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22879,7 +23078,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [%[r], #68]\n\t" /* A[11] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22918,7 +23117,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[10] * B[8] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 
4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22956,7 +23155,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[9] */ "ldr r11, [%[a], #36]\n\t" "ldr r12, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -22994,7 +23193,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[10] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23032,7 +23231,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[11] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23070,7 +23269,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [%[r], #72]\n\t" /* A[8] * B[11] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23109,7 +23308,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[9] * B[10] */ "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23146,7 +23345,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[10] * B[9] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23184,7 +23383,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[8] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23221,7 +23420,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif "str r4, [%[r], #76]\n\t" /* A[11] * B[9] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23261,7 +23460,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[10] */ "ldr r11, [%[a], #40]\n\t" "ldr r12, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23299,7 +23498,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[11] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23336,7 +23535,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif "str r5, [%[r], #80]\n\t" /* A[10] * B[11] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23375,7 +23574,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[11] * B[10] */ "ldr r8, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23412,7 +23611,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif "str r3, [%[r], #84]\n\t" /* A[11] * B[11] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -23438,9 +23637,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" #else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r7\n\t" + "umlal r4, r5, r8, r9\n\t" #endif "str r4, [%[r], #88]\n\t" "str r5, [%[r], #92]\n\t" @@ -23452,7 +23649,7 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "stm %[r]!, {r3, r4, r5, r6}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12" ); } @@ -23464,12 +23661,11 @@ static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b */ static sp_digit sp_3072_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -23491,10 +23687,11 @@ static sp_digit sp_3072_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -23506,8 +23703,8 @@ static sp_digit sp_3072_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit sp_3072_sub_in_place_24(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -23568,12 +23765,11 @@ static sp_digit sp_3072_sub_in_place_24(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_3072_add_24(sp_digit* r_p, 
const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -23616,10 +23812,11 @@ static sp_digit sp_3072_add_24(sp_digit* r_p, const sp_digit* a_p, const sp_digi "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -23700,8 +23897,8 @@ SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, */ static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -23804,12 +24001,11 @@ static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_3072_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -23894,10 +24090,11 @@ static sp_digit sp_3072_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digi "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -23978,8 +24175,8 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, */ static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -24166,12 +24363,11 @@ static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_3072_add_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, 
r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -24340,10 +24536,11 @@ static sp_digit sp_3072_add_96(sp_digit* r_p, const sp_digit* a_p, const sp_digi "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -24424,14 +24621,14 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, */ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #48\n\t" /* A[0] * A[0] */ "ldr r10, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsr r9, r10, #16\n\t" "lsl r2, r10, #16\n\t" "lsr r2, r2, #16\n\t" @@ -24450,7 +24647,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[1] */ "ldr r10, [%[a], #4]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24506,7 +24703,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24560,7 +24757,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[1] * A[1] */ "ldr r10, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24590,7 +24787,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24645,7 +24842,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24698,7 +24895,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24753,7 +24950,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24804,7 
+25001,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[2] * A[2] */ "ldr r10, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24834,7 +25031,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24864,7 +25061,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24902,7 +25099,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24947,7 +25144,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -24977,7 +25174,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25015,7 +25212,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25052,7 +25249,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[3] * A[3] */ "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25091,7 +25288,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25121,7 +25318,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25159,7 +25356,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #8]\n\t" 
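/*
 * The (WOLFSSL_ARM_ARCH < 4) branches above, renamed throughout this change
 * from the old WOLFSSL_SP_ARM_ARCH guard, build each 32x32->64 product out of
 * four 16x16 partial products with lsl/lsr #16, since targets below ARMv4 are
 * assumed to lack the long-multiply instructions umull/umlal.  A minimal C
 * sketch of that split follows; mul32x32_to_64 is a name used here purely for
 * illustration and is not a wolfSSL function.
 */
#include <stdint.h>
#include <stdio.h>

static void mul32x32_to_64(uint32_t a, uint32_t b, uint32_t *lo, uint32_t *hi)
{
    uint32_t al = a & 0xffff, ah = a >> 16;   /* the lsl #16 / lsr #16 split */
    uint32_t bl = b & 0xffff, bh = b >> 16;
    uint32_t l  = al * bl;                    /* low  partial product */
    uint32_t h  = ah * bh;                    /* high partial product */
    uint32_t m1 = al * bh;                    /* cross terms: each contributes */
    uint32_t m2 = ah * bl;                    /* to both result words */
    uint32_t t;

    t  = l + (m1 << 16);
    h += (t < l) + (m1 >> 16);                /* carry out of the low word */
    l  = t;
    t  = l + (m2 << 16);
    h += (t < l) + (m2 >> 16);
    *lo = t;
    *hi = h;
}

int main(void)
{
    uint32_t lo, hi;
    mul32x32_to_64(0x89abcdefu, 0x12345678u, &lo, &hi);
    /* Should match the single-instruction result the #else branches get from
     * umull (or umlal when the product is folded into an accumulator pair). */
    printf("%08x%08x vs %016llx\n", hi, lo,
           (unsigned long long)((uint64_t)0x89abcdefu * 0x12345678u));
    return 0;
}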
-#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25197,7 +25394,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25242,7 +25439,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25272,7 +25469,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25310,7 +25507,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25348,7 +25545,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25385,7 +25582,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[4] * A[4] */ "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25424,7 +25621,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25454,7 +25651,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25492,7 +25689,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25530,7 +25727,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 
4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25568,7 +25765,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25613,7 +25810,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25643,7 +25840,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25681,7 +25878,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25719,7 +25916,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25757,7 +25954,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25794,7 +25991,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[5] * A[5] */ "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25833,7 +26030,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25863,7 +26060,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25901,7 +26098,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25939,7 +26136,7 @@ static void 
sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -25977,7 +26174,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26015,7 +26212,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26060,7 +26257,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26090,7 +26287,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26128,7 +26325,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26166,7 +26363,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26204,7 +26401,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26241,7 +26438,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[6] * A[6] */ "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26280,7 +26477,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26310,7 +26507,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr 
r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26348,7 +26545,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26386,7 +26583,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26424,7 +26621,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26469,7 +26666,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26499,7 +26696,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26537,7 +26734,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26575,7 +26772,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26612,7 +26809,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[7] * A[7] */ "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26651,7 +26848,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26681,7 +26878,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26719,7 +26916,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26757,7 +26954,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26802,7 +26999,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26832,7 +27029,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26870,7 +27067,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26907,7 +27104,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[8] * A[8] */ "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26946,7 +27143,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -26976,7 +27173,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -27014,7 +27211,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -27059,7 +27256,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" 
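/*
 * In sp_3072_sqr_12 each off-diagonal product A[i]*A[j] with i < j, such as
 * the A[7]*A[11] term here, is formed once and accumulated twice, while the
 * diagonal A[i]*A[i] terms are accumulated once; that is why the squaring
 * routine needs roughly half the multiplies of sp_3072_mul_12.  A rough C
 * model of that column accumulation, one word at a time; sqr_words is a name
 * introduced only for illustration (the real code keeps the running column
 * sum in registers and spills to the stack):
 */
#include <stdint.h>
#include <stdio.h>

static void sqr_words(uint32_t *r, const uint32_t *a, int n)
{
    for (int k = 0; k < 2 * n; k++)
        r[k] = 0;
    for (int i = 0; i < n; i++) {
        for (int j = i; j < n; j++) {
            uint64_t p = (uint64_t)a[i] * a[j];
            int reps = (i == j) ? 1 : 2;       /* double off-diagonal terms */
            while (reps-- > 0) {
                uint64_t acc = p;
                int k = i + j;
                while (acc != 0) {             /* ripple the carry upward */
                    acc += r[k];
                    r[k] = (uint32_t)acc;
                    acc >>= 32;
                    k++;
                }
            }
        }
    }
}

int main(void)
{
    uint32_t a[2] = { 0xffffffff, 0xffffffff }; /* 2^64 - 1 */
    uint32_t r[4];
    sqr_words(r, a, 2);
    /* Expected: 00000001 00000000 fffffffe ffffffff, i.e. (2^64 - 1)^2 */
    printf("%08x %08x %08x %08x\n", r[0], r[1], r[2], r[3]);
    return 0;
}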
"lsr r8, r8, #16\n\t" @@ -27114,7 +27311,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -27165,7 +27362,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[9] * A[9] */ "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -27195,7 +27392,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -27250,7 +27447,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -27303,7 +27500,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -27357,7 +27554,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[10] * A[10] */ "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -27387,7 +27584,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -27442,7 +27639,7 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) "str r2, [%[r], #84]\n\t" /* A[11] * A[11] */ "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -27487,9 +27684,9 @@ static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) */ static sp_digit sp_3072_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -27565,9 +27762,9 @@ SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) */ static sp_digit sp_3072_sub_24(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - 
register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -27664,9 +27861,9 @@ SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) */ static sp_digit sp_3072_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -27807,9 +28004,9 @@ SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) */ static sp_digit sp_3072_add_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -27845,16 +28042,15 @@ static sp_digit sp_3072_add_96(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" "mov r12, #0\n\t" "add lr, %[a], #0x180\n\t" "\n" "L_sp_3072_sub_in_pkace_96_word_%=: \n\t" - "subs r12, r10, r12\n\t" + "rsbs r12, r12, #0\n\t" "ldm %[a], {r2, r3, r4, r5}\n\t" "ldm %[b]!, {r6, r7, r8, r9}\n\t" "sbcs r2, r2, r6\n\t" @@ -27862,13 +28058,13 @@ static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p) "sbcs r4, r4, r8\n\t" "sbcs r5, r5, r9\n\t" "stm %[a]!, {r2, r3, r4, r5}\n\t" - "sbc r12, r10, r10\n\t" + "sbc r12, r12, r12\n\t" "cmp %[a], lr\n\t" "bne L_sp_3072_sub_in_pkace_96_word_%=\n\t" "mov %[a], r12\n\t" : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -27883,9 +28079,9 @@ static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p) */ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x300\n\t" @@ -27903,7 +28099,7 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "L_sp_3072_mul_96_inner_%=: \n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[b], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, 
r9, #16\n\t" @@ -27973,12 +28169,11 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b */ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x300\n\t" - "mov r12, #0\n\t" "mov r6, #0\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" @@ -27987,7 +28182,7 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "L_sp_3072_sqr_96_outer_%=: \n\t" "subs r3, r5, #0x17c\n\t" "it cc\n\t" - "movcc r3, r12\n\t" + "movcc r3, #0\n\t" "sub r4, r5, r3\n\t" "\n" "L_sp_3072_sqr_96_inner_%=: \n\t" @@ -27995,7 +28190,7 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "beq L_sp_3072_sqr_96_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -28048,7 +28243,7 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "\n" "L_sp_3072_sqr_96_op_sqr_%=: \n\t" "ldr lr, [%[a], r3]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsr r10, lr, #16\n\t" "lsr r9, r9, #16\n\t" @@ -28102,7 +28297,7 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "bgt L_sp_3072_sqr_96_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -28134,9 +28329,9 @@ static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) */ static sp_digit sp_3072_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -28172,16 +28367,15 @@ static sp_digit sp_3072_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" "mov r12, #0\n\t" "add lr, %[a], #0xc0\n\t" "\n" "L_sp_3072_sub_in_pkace_48_word_%=: \n\t" - "subs r12, r10, r12\n\t" + "rsbs r12, r12, #0\n\t" "ldm %[a], {r2, r3, r4, r5}\n\t" "ldm %[b]!, {r6, r7, r8, r9}\n\t" "sbcs r2, r2, r6\n\t" @@ -28189,13 +28383,13 @@ static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p) "sbcs r4, r4, r8\n\t" "sbcs r5, r5, r9\n\t" "stm %[a]!, {r2, r3, r4, r5}\n\t" - "sbc r12, r10, r10\n\t" + "sbc r12, r12, r12\n\t" "cmp %[a], lr\n\t" "bne L_sp_3072_sub_in_pkace_48_word_%=\n\t" "mov %[a], r12\n\t" : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -28210,9 
+28404,9 @@ static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p) */ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x180\n\t" @@ -28230,7 +28424,7 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "L_sp_3072_mul_48_inner_%=: \n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[b], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -28300,12 +28494,11 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b */ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x180\n\t" - "mov r12, #0\n\t" "mov r6, #0\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" @@ -28314,7 +28507,7 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "L_sp_3072_sqr_48_outer_%=: \n\t" "subs r3, r5, #0xbc\n\t" "it cc\n\t" - "movcc r3, r12\n\t" + "movcc r3, #0\n\t" "sub r4, r5, r3\n\t" "\n" "L_sp_3072_sqr_48_inner_%=: \n\t" @@ -28322,7 +28515,7 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "beq L_sp_3072_sqr_48_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -28375,7 +28568,7 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "\n" "L_sp_3072_sqr_48_op_sqr_%=: \n\t" "ldr lr, [%[a], r3]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsr r10, lr, #16\n\t" "lsr r9, r9, #16\n\t" @@ -28429,7 +28622,7 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "bgt L_sp_3072_sqr_48_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -28465,15 +28658,14 @@ static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho) */ static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ "ldr r8, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r5, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -28506,7 +28698,7 @@ static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) 
"L_sp_3072_mul_d_96_word_%=: \n\t" /* A[i] * B */ "ldr r8, [%[a], r9]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -28551,7 +28743,7 @@ static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "str r3, [%[r], #384]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -28564,15 +28756,14 @@ static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r3, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -28597,3771 +28788,43 @@ static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) #else "umull r3, r4, %[b], r8\n\t" #endif + "stm %[r]!, {r3}\n\t" "mov r5, #0\n\t" - "str r3, [%[r]], #4\n\t" /* A[1] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" "lsr r7, r7, #16\n\t" "mul r7, r6, r7\n\t" "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, #0\n\t" "lsr r7, r8, #16\n\t" "mul r6, r7, r6\n\t" "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, r7\n\t" "lsr r6, %[b], #16\n\t" "lsr r7, r8, #16\n\t" "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "add r5, r5, r7\n\t" "lsl r7, r8, #16\n\t" "lsr r7, r7, #16\n\t" "mul r6, r7, r6\n\t" "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" + "umlal r4, r5, %[b], r8\n\t" #endif - "str r4, [%[r]], #4\n\t" + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" /* A[2] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds 
r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[3] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[4] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[5] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[6] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds 
r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[7] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[8] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[9] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[10] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, 
#0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[11] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[12] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[13] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[14] 
* B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[15] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[16] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[17] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, 
#16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[18] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[19] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[20] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[21] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" 
- "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[22] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[23] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[24] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[25] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - 
"lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[26] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[27] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[28] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, 
r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[29] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[30] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[31] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[32] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - 
"mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[33] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[34] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[35] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[36] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - 
"adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[37] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[38] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[39] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - 
"str r3, [%[r]], #4\n\t" - /* A[40] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[41] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[42] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[43] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - 
"mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[44] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[45] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[46] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[47] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, 
r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[48] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[49] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[50] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[51] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[52] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[53] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[54] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, 
r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[55] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[56] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[57] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[58] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, 
#0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[59] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[60] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[61] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[62] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, 
#16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[63] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[64] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[65] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, 
r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[66] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[67] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[68] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[69] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, 
r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[70] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[71] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[72] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[73] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, 
#0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[74] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[75] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[76] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[77] * B */ - "ldr r8, 
[%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[78] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[79] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[80] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl 
r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[81] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[82] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[83] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[84] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, 
r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[85] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[86] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[87] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[88] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, 
#16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[89] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[90] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[91] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], 
r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[92] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[93] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[94] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[95] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -32387,15 +28850,2989 @@ static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "adds r5, r5, r6\n\t" "adc r3, r3, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[3] * B */ + 
"ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[4] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[5] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" "adds r5, r5, r6\n\t" "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" #endif - "str r5, [%[r]], #4\n\t" + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[6] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[7] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, 
r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[8] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[9] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[10] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[11] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[12] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + 
"lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[13] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[14] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[15] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[16] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[17] * B */ + 
"ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[18] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[19] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[20] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[21] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + 
"lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[22] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[23] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[24] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[25] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[26] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, 
#16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[27] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[28] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[29] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[30] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[31] * B */ + "ldm %[a]!, 
{r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[32] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[33] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[34] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[35] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, 
#16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[36] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[37] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[38] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[39] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[40] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul 
r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[41] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[42] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[43] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[44] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[45] * B */ + "ldm %[a]!, {r8}\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[46] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[47] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[48] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[49] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul 
r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[50] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[51] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[52] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[53] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[54] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" 
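        /* Descriptive note on the repeated guarded sequence above and below
         * (an annotation, not generated output): when WOLFSSL_ARM_ARCH < 4 the
         * long-multiply instructions UMULL/UMLAL cannot be assumed, so each
         * 32x32 -> 64-bit multiply-accumulate of %[b] with the word loaded
         * into r8 is synthesized from four 16x16 -> 32-bit MULs. Both operands
         * are split into 16-bit halves with LSL/LSR, the four partial products
         * are shifted into position, and they are folded into the two-register
         * accumulator with ADDS/ADC, using the identity
         *   b*a = (b_hi*a_hi << 32) + ((b_hi*a_lo + b_lo*a_hi) << 16) + b_lo*a_lo.
         * On ARMv4 and later the #else branch collapses the whole sequence to a
         * single UMLAL. */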
+ "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[55] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[56] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[57] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[58] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[59] * B */ + "ldm %[a]!, {r8}\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[60] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[61] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[62] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[63] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul 
r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[64] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[65] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[66] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[67] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[68] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" 
+ "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[69] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[70] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[71] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[72] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[73] * B */ + "ldm %[a]!, {r8}\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[74] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[75] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[76] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[77] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul 
r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[78] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[79] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[80] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[81] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[82] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" 
+ "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[83] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[84] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[85] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[86] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[87] * B */ + "ldm %[a]!, {r8}\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[88] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[89] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[90] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[91] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul 
r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[92] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[93] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[94] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[95] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" "str r3, [%[r]]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" ); } @@ -32426,10 +31863,10 @@ static void sp_3072_mont_norm_48(sp_digit* r, const sp_digit* m) */ static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const 
sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r6, #0\n\t" @@ -32466,10 +31903,10 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp */ static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -32650,6 +32087,7 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp } #endif /* WOLFSSL_SP_SMALL */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Reduce the number back to 3072 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -32658,12 +32096,12 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp */ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* m asm ("r1") = m_p; - register sp_digit mp asm ("r2") = mp_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "ldr r11, [%[m]]\n\t" #endif /* i = 0 */ @@ -32676,10 +32114,9 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ /* mu = a[i] * mp */ "mul r8, %[mp], r12\n\t" /* a[i+0] += m[0] * mu */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r11, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r7\n\t" @@ -32703,14 +32140,8 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r12, r12, r6\n\t" "adc r5, r5, r7\n\t" -#else - "umull r6, r7, r8, r11\n\t" - "adds r12, r12, r6\n\t" - "adc r5, r7, #0\n\t" -#endif /* a[i+1] += m[1] * mu */ "ldr r7, [%[m], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r10\n\t" @@ -32734,18 +32165,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r4, r4, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r4, r10, #0\n\t" -#endif "mov r12, lr\n\t" "adds r12, r12, r5\n\t" "adc r4, r4, #0\n\t" /* a[i+2] += m[2] * mu */ "ldr r7, [%[m], #8]\n\t" "ldr lr, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, 
r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r10\n\t" @@ -32769,17 +32194,11 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r5, r5, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r5, r10, #0\n\t" -#endif "adds lr, lr, r4\n\t" "adc r5, r5, #0\n\t" /* a[i+3] += m[3] * mu */ "ldr r7, [%[m], #12]\n\t" "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -32803,18 +32222,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #12]\n\t" "adc r4, r4, #0\n\t" /* a[i+4] += m[4] * mu */ "ldr r7, [%[m], #16]\n\t" "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -32838,18 +32251,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #16]\n\t" "adc r5, r5, #0\n\t" /* a[i+5] += m[5] * mu */ "ldr r7, [%[m], #20]\n\t" "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -32873,18 +32280,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #20]\n\t" "adc r4, r4, #0\n\t" /* a[i+6] += m[6] * mu */ "ldr r7, [%[m], #24]\n\t" "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -32908,18 +32309,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #24]\n\t" "adc r5, r5, #0\n\t" /* a[i+7] += m[7] * mu */ "ldr r7, [%[m], #28]\n\t" "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -32943,18 +32338,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #28]\n\t" "adc r4, r4, #0\n\t" /* a[i+8] += m[8] * mu */ "ldr r7, [%[m], #32]\n\t" "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -32978,18 +32367,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, 
r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #32]\n\t" "adc r5, r5, #0\n\t" /* a[i+9] += m[9] * mu */ "ldr r7, [%[m], #36]\n\t" "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33013,18 +32396,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #36]\n\t" "adc r4, r4, #0\n\t" /* a[i+10] += m[10] * mu */ "ldr r7, [%[m], #40]\n\t" "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33048,18 +32425,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #40]\n\t" "adc r5, r5, #0\n\t" /* a[i+11] += m[11] * mu */ "ldr r7, [%[m], #44]\n\t" "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33083,18 +32454,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #44]\n\t" "adc r4, r4, #0\n\t" /* a[i+12] += m[12] * mu */ "ldr r7, [%[m], #48]\n\t" "ldr r10, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33118,18 +32483,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #48]\n\t" "adc r5, r5, #0\n\t" /* a[i+13] += m[13] * mu */ "ldr r7, [%[m], #52]\n\t" "ldr r10, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33153,18 +32512,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #52]\n\t" "adc r4, r4, #0\n\t" /* a[i+14] += m[14] * mu */ "ldr r7, [%[m], #56]\n\t" "ldr r10, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33188,18 +32541,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #56]\n\t" "adc r5, r5, #0\n\t" /* a[i+15] += m[15] * mu */ "ldr r7, [%[m], #60]\n\t" "ldr r10, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH 
< 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33223,18 +32570,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #60]\n\t" "adc r4, r4, #0\n\t" /* a[i+16] += m[16] * mu */ "ldr r7, [%[m], #64]\n\t" "ldr r10, [%[a], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33258,18 +32599,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #64]\n\t" "adc r5, r5, #0\n\t" /* a[i+17] += m[17] * mu */ "ldr r7, [%[m], #68]\n\t" "ldr r10, [%[a], #68]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33293,18 +32628,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #68]\n\t" "adc r4, r4, #0\n\t" /* a[i+18] += m[18] * mu */ "ldr r7, [%[m], #72]\n\t" "ldr r10, [%[a], #72]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33328,18 +32657,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #72]\n\t" "adc r5, r5, #0\n\t" /* a[i+19] += m[19] * mu */ "ldr r7, [%[m], #76]\n\t" "ldr r10, [%[a], #76]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33363,18 +32686,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #76]\n\t" "adc r4, r4, #0\n\t" /* a[i+20] += m[20] * mu */ "ldr r7, [%[m], #80]\n\t" "ldr r10, [%[a], #80]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33398,18 +32715,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #80]\n\t" "adc r5, r5, #0\n\t" /* a[i+21] += m[21] * mu */ "ldr r7, [%[m], #84]\n\t" "ldr r10, [%[a], #84]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33433,18 +32744,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" 
-#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #84]\n\t" "adc r4, r4, #0\n\t" /* a[i+22] += m[22] * mu */ "ldr r7, [%[m], #88]\n\t" "ldr r10, [%[a], #88]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33468,18 +32773,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #88]\n\t" "adc r5, r5, #0\n\t" /* a[i+23] += m[23] * mu */ "ldr r7, [%[m], #92]\n\t" "ldr r10, [%[a], #92]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33503,18 +32802,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #92]\n\t" "adc r4, r4, #0\n\t" /* a[i+24] += m[24] * mu */ "ldr r7, [%[m], #96]\n\t" "ldr r10, [%[a], #96]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33538,18 +32831,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #96]\n\t" "adc r5, r5, #0\n\t" /* a[i+25] += m[25] * mu */ "ldr r7, [%[m], #100]\n\t" "ldr r10, [%[a], #100]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33573,18 +32860,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #100]\n\t" "adc r4, r4, #0\n\t" /* a[i+26] += m[26] * mu */ "ldr r7, [%[m], #104]\n\t" "ldr r10, [%[a], #104]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33608,18 +32889,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #104]\n\t" "adc r5, r5, #0\n\t" /* a[i+27] += m[27] * mu */ "ldr r7, [%[m], #108]\n\t" "ldr r10, [%[a], #108]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33643,18 +32918,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #108]\n\t" "adc r4, r4, #0\n\t" /* a[i+28] += m[28] * mu */ "ldr r7, [%[m], #112]\n\t" "ldr r10, [%[a], 
#112]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33678,18 +32947,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #112]\n\t" "adc r5, r5, #0\n\t" /* a[i+29] += m[29] * mu */ "ldr r7, [%[m], #116]\n\t" "ldr r10, [%[a], #116]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33713,18 +32976,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #116]\n\t" "adc r4, r4, #0\n\t" /* a[i+30] += m[30] * mu */ "ldr r7, [%[m], #120]\n\t" "ldr r10, [%[a], #120]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33748,18 +33005,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #120]\n\t" "adc r5, r5, #0\n\t" /* a[i+31] += m[31] * mu */ "ldr r7, [%[m], #124]\n\t" "ldr r10, [%[a], #124]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33783,18 +33034,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #124]\n\t" "adc r4, r4, #0\n\t" /* a[i+32] += m[32] * mu */ "ldr r7, [%[m], #128]\n\t" "ldr r10, [%[a], #128]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33818,18 +33063,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #128]\n\t" "adc r5, r5, #0\n\t" /* a[i+33] += m[33] * mu */ "ldr r7, [%[m], #132]\n\t" "ldr r10, [%[a], #132]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33853,18 +33092,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #132]\n\t" "adc r4, r4, #0\n\t" /* a[i+34] += m[34] * mu */ "ldr r7, [%[m], #136]\n\t" "ldr r10, [%[a], #136]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33888,18 +33121,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, 
const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #136]\n\t" "adc r5, r5, #0\n\t" /* a[i+35] += m[35] * mu */ "ldr r7, [%[m], #140]\n\t" "ldr r10, [%[a], #140]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33923,18 +33150,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #140]\n\t" "adc r4, r4, #0\n\t" /* a[i+36] += m[36] * mu */ "ldr r7, [%[m], #144]\n\t" "ldr r10, [%[a], #144]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -33958,18 +33179,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #144]\n\t" "adc r5, r5, #0\n\t" /* a[i+37] += m[37] * mu */ "ldr r7, [%[m], #148]\n\t" "ldr r10, [%[a], #148]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -33993,18 +33208,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #148]\n\t" "adc r4, r4, #0\n\t" /* a[i+38] += m[38] * mu */ "ldr r7, [%[m], #152]\n\t" "ldr r10, [%[a], #152]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -34028,18 +33237,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #152]\n\t" "adc r5, r5, #0\n\t" /* a[i+39] += m[39] * mu */ "ldr r7, [%[m], #156]\n\t" "ldr r10, [%[a], #156]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -34063,18 +33266,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #156]\n\t" "adc r4, r4, #0\n\t" /* a[i+40] += m[40] * mu */ "ldr r7, [%[m], #160]\n\t" "ldr r10, [%[a], #160]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -34098,18 +33295,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], 
#160]\n\t" "adc r5, r5, #0\n\t" /* a[i+41] += m[41] * mu */ "ldr r7, [%[m], #164]\n\t" "ldr r10, [%[a], #164]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -34133,18 +33324,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #164]\n\t" "adc r4, r4, #0\n\t" /* a[i+42] += m[42] * mu */ "ldr r7, [%[m], #168]\n\t" "ldr r10, [%[a], #168]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -34168,18 +33353,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #168]\n\t" "adc r5, r5, #0\n\t" /* a[i+43] += m[43] * mu */ "ldr r7, [%[m], #172]\n\t" "ldr r10, [%[a], #172]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -34203,18 +33382,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #172]\n\t" "adc r4, r4, #0\n\t" /* a[i+44] += m[44] * mu */ "ldr r7, [%[m], #176]\n\t" "ldr r10, [%[a], #176]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -34238,18 +33411,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #176]\n\t" "adc r5, r5, #0\n\t" /* a[i+45] += m[45] * mu */ "ldr r7, [%[m], #180]\n\t" "ldr r10, [%[a], #180]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -34273,18 +33440,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #180]\n\t" "adc r4, r4, #0\n\t" /* a[i+46] += m[46] * mu */ "ldr r7, [%[m], #184]\n\t" "ldr r10, [%[a], #184]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -34308,22 +33469,16 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #184]\n\t" "adc r5, r5, #0\n\t" /* a[i+47] += m[47] * mu */ -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) - "ldr r7, [%[m], #188]\n\t" -#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, 
[%[m], #188]\n\t" +#else + "ldr r7, [%[m], #188]\n\t" #endif "ldr r10, [%[a], #188]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r11, #16\n\t" "lsr r6, r6, #16\n\t" @@ -34354,13 +33509,6 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "adds r5, r5, r6\n\t" "adcs r4, r4, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r7, r3\n\t" - "mov r3, #0\n\t" - "adc r3, r3, r3\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #188]\n\t" "ldr r10, [%[a], #192]\n\t" @@ -34372,6 +33520,7 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ "add %[a], %[a], #4\n\t" "cmp r9, #0xc0\n\t" "blt L_sp_3072_mont_reduce_48_word_%=\n\t" + /* Loop Done */ "str r12, [%[a]]\n\t" "str lr, [%[a], #4]\n\t" "mov %[mp], r3\n\t" @@ -34382,6 +33531,715 @@ static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_ sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + "ldr r11, [%[m]]\n\t" + /* i = 0 */ + "mov r9, #0\n\t" + "mov r3, #0\n\t" + "ldr r12, [%[a]]\n\t" + "ldr lr, [%[a], #4]\n\t" + "\n" + "L_sp_3072_mont_reduce_48_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r8, %[mp], r12\n\t" + /* a[i+0] += m[0] * mu */ + "mov r5, #0\n\t" + "umlal r12, r5, r8, r11\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r7, [%[m], #4]\n\t" + "mov r4, #0\n\t" + "umlal lr, r4, r8, r7\n\t" + "mov r12, lr\n\t" + "adds r12, r12, r5\n\t" + "adc r4, r4, #0\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r7, [%[m], #8]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r5, #0\n\t" + "umlal lr, r5, r8, r7\n\t" + "adds lr, lr, r4\n\t" + "adc r5, r5, #0\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r7, [%[m], #12]\n\t" + "ldr r10, [%[a], #12]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #12]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r7, [%[m], #16]\n\t" + "ldr r10, [%[a], #16]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #16]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r7, [%[m], #20]\n\t" + "ldr r10, [%[a], #20]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #20]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r7, [%[m], #24]\n\t" + "ldr r10, [%[a], #24]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #24]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r7, [%[m], #28]\n\t" + "ldr r10, [%[a], #28]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #28]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r7, [%[m], #32]\n\t" + "ldr r10, [%[a], #32]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], 
#32]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r7, [%[m], #36]\n\t" + "ldr r10, [%[a], #36]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #36]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r7, [%[m], #40]\n\t" + "ldr r10, [%[a], #40]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #40]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r7, [%[m], #44]\n\t" + "ldr r10, [%[a], #44]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #44]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r7, [%[m], #48]\n\t" + "ldr r10, [%[a], #48]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #48]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r7, [%[m], #52]\n\t" + "ldr r10, [%[a], #52]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #52]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r7, [%[m], #56]\n\t" + "ldr r10, [%[a], #56]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #56]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r7, [%[m], #60]\n\t" + "ldr r10, [%[a], #60]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #60]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r7, [%[m], #64]\n\t" + "ldr r10, [%[a], #64]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #64]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r7, [%[m], #68]\n\t" + "ldr r10, [%[a], #68]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #68]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r7, [%[m], #72]\n\t" + "ldr r10, [%[a], #72]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #72]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r7, [%[m], #76]\n\t" + "ldr r10, [%[a], #76]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #76]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r7, [%[m], #80]\n\t" + "ldr r10, [%[a], #80]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #80]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r7, [%[m], #84]\n\t" + "ldr r10, [%[a], #84]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #84]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r7, [%[m], #88]\n\t" + "ldr r10, [%[a], #88]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #88]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r7, [%[m], #92]\n\t" + "ldr r10, [%[a], #92]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #92]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r7, [%[m], #96]\n\t" + "ldr r10, [%[a], #96]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #96]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r7, [%[m], #100]\n\t" + "ldr r10, [%[a], #100]\n\t" + "mov r4, 
#0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #100]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r7, [%[m], #104]\n\t" + "ldr r10, [%[a], #104]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #104]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r7, [%[m], #108]\n\t" + "ldr r10, [%[a], #108]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #108]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r7, [%[m], #112]\n\t" + "ldr r10, [%[a], #112]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #112]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r7, [%[m], #116]\n\t" + "ldr r10, [%[a], #116]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #116]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r7, [%[m], #120]\n\t" + "ldr r10, [%[a], #120]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #120]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r7, [%[m], #124]\n\t" + "ldr r10, [%[a], #124]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #124]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+32] += m[32] * mu */ + "ldr r7, [%[m], #128]\n\t" + "ldr r10, [%[a], #128]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #128]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+33] += m[33] * mu */ + "ldr r7, [%[m], #132]\n\t" + "ldr r10, [%[a], #132]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #132]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+34] += m[34] * mu */ + "ldr r7, [%[m], #136]\n\t" + "ldr r10, [%[a], #136]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #136]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+35] += m[35] * mu */ + "ldr r7, [%[m], #140]\n\t" + "ldr r10, [%[a], #140]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #140]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+36] += m[36] * mu */ + "ldr r7, [%[m], #144]\n\t" + "ldr r10, [%[a], #144]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #144]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+37] += m[37] * mu */ + "ldr r7, [%[m], #148]\n\t" + "ldr r10, [%[a], #148]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #148]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+38] += m[38] * mu */ + "ldr r7, [%[m], #152]\n\t" + "ldr r10, [%[a], #152]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #152]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+39] += m[39] * mu */ + "ldr r7, [%[m], #156]\n\t" + "ldr r10, [%[a], #156]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #156]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+40] += m[40] * mu */ + "ldr r7, [%[m], #160]\n\t" + "ldr r10, [%[a], #160]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #160]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+41] += m[41] * mu */ + "ldr r7, [%[m], #164]\n\t" + "ldr r10, [%[a], #164]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], 
#164]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+42] += m[42] * mu */ + "ldr r7, [%[m], #168]\n\t" + "ldr r10, [%[a], #168]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #168]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+43] += m[43] * mu */ + "ldr r7, [%[m], #172]\n\t" + "ldr r10, [%[a], #172]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #172]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+44] += m[44] * mu */ + "ldr r7, [%[m], #176]\n\t" + "ldr r10, [%[a], #176]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #176]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+45] += m[45] * mu */ + "ldr r7, [%[m], #180]\n\t" + "ldr r10, [%[a], #180]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #180]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+46] += m[46] * mu */ + "ldr r7, [%[m], #184]\n\t" + "ldr r10, [%[a], #184]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #184]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+47] += m[47] * mu */ + "ldr r7, [%[m], #188]\n\t" + "ldr r10, [%[a], #188]\n\t" + "umull r6, r7, r8, r7\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r7, r3\n\t" + "mov r3, #0\n\t" + "adc r3, r3, r3\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #188]\n\t" + "ldr r10, [%[a], #192]\n\t" + "adcs r10, r10, r4\n\t" + "str r10, [%[a], #192]\n\t" + "adc r3, r3, #0\n\t" + /* i += 1 */ + "add r9, r9, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r9, #0xc0\n\t" + "blt L_sp_3072_mont_reduce_48_word_%=\n\t" + /* Loop Done */ + "str r12, [%[a]]\n\t" + "str lr, [%[a], #4]\n\t" + "mov %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp); +} + +#else +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "\n" + "L_sp_3072_mont_reduce_48_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r11, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "ldr r10, [%[m]]\n\t" + "mov r3, #0\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r10, [%[m], #4]\n\t" + "mov r4, r5\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r10, [%[m], #8]\n\t" + "mov r5, r6\n\t" + "umaal r5, r3, r11, r10\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r10, [%[m], #12]\n\t" + "mov r6, r7\n\t" + "umaal r6, r3, r11, r10\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r10, [%[m], #16]\n\t" + "mov r7, r8\n\t" + "umaal r7, r3, r11, r10\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r10, [%[m], #20]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umaal r8, r3, r11, r10\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r10, [%[m], #24]\n\t" + "ldr r9, [%[a], #24]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r10, [%[m], #28]\n\t" + "ldr r9, [%[a], #28]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r10, [%[m], #32]\n\t" + "ldr r9, [%[a], #32]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r10, [%[m], #36]\n\t" + "ldr r9, [%[a], #36]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r10, [%[m], #40]\n\t" + "ldr r9, [%[a], #40]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r10, [%[m], #44]\n\t" + "ldr r9, [%[a], #44]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r10, [%[m], #48]\n\t" + "ldr r9, [%[a], #48]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r10, [%[m], #52]\n\t" + "ldr r9, [%[a], #52]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r10, [%[m], #56]\n\t" + "ldr r9, [%[a], #56]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r10, [%[m], #60]\n\t" + "ldr r9, [%[a], #60]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r10, [%[m], #64]\n\t" + "ldr r9, [%[a], #64]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r10, [%[m], #68]\n\t" + "ldr r9, [%[a], #68]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r10, [%[m], #72]\n\t" + "ldr r9, [%[a], #72]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r10, [%[m], #76]\n\t" + "ldr r9, [%[a], #76]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r10, [%[m], #80]\n\t" + "ldr r9, [%[a], #80]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r10, [%[m], #84]\n\t" + "ldr r9, [%[a], #84]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], 
#84]\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r10, [%[m], #88]\n\t" + "ldr r9, [%[a], #88]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r10, [%[m], #92]\n\t" + "ldr r9, [%[a], #92]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r10, [%[m], #96]\n\t" + "ldr r9, [%[a], #96]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r10, [%[m], #100]\n\t" + "ldr r9, [%[a], #100]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r10, [%[m], #104]\n\t" + "ldr r9, [%[a], #104]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r10, [%[m], #108]\n\t" + "ldr r9, [%[a], #108]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r10, [%[m], #112]\n\t" + "ldr r9, [%[a], #112]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r10, [%[m], #116]\n\t" + "ldr r9, [%[a], #116]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r10, [%[m], #120]\n\t" + "ldr r9, [%[a], #120]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r10, [%[m], #124]\n\t" + "ldr r9, [%[a], #124]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #124]\n\t" + /* a[i+32] += m[32] * mu */ + "ldr r10, [%[m], #128]\n\t" + "ldr r9, [%[a], #128]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #128]\n\t" + /* a[i+33] += m[33] * mu */ + "ldr r10, [%[m], #132]\n\t" + "ldr r9, [%[a], #132]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #132]\n\t" + /* a[i+34] += m[34] * mu */ + "ldr r10, [%[m], #136]\n\t" + "ldr r9, [%[a], #136]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #136]\n\t" + /* a[i+35] += m[35] * mu */ + "ldr r10, [%[m], #140]\n\t" + "ldr r9, [%[a], #140]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #140]\n\t" + /* a[i+36] += m[36] * mu */ + "ldr r10, [%[m], #144]\n\t" + "ldr r9, [%[a], #144]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #144]\n\t" + /* a[i+37] += m[37] * mu */ + "ldr r10, [%[m], #148]\n\t" + "ldr r9, [%[a], #148]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #148]\n\t" + /* a[i+38] += m[38] * mu */ + "ldr r10, [%[m], #152]\n\t" + "ldr r9, [%[a], #152]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #152]\n\t" + /* a[i+39] += m[39] * mu */ + "ldr r10, [%[m], #156]\n\t" + "ldr r9, [%[a], #156]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #156]\n\t" + /* a[i+40] += m[40] * mu */ + "ldr r10, [%[m], #160]\n\t" + "ldr r9, [%[a], #160]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #160]\n\t" + /* a[i+41] += m[41] * mu */ + "ldr r10, [%[m], #164]\n\t" + "ldr r9, [%[a], #164]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #164]\n\t" + /* a[i+42] += m[42] * mu */ + "ldr r10, [%[m], #168]\n\t" + "ldr r9, [%[a], #168]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #168]\n\t" + /* a[i+43] += m[43] * mu */ + "ldr r10, [%[m], #172]\n\t" + "ldr r9, [%[a], #172]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #172]\n\t" + /* a[i+44] += m[44] * mu */ + "ldr r10, [%[m], #176]\n\t" + "ldr r9, [%[a], #176]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #176]\n\t" + /* a[i+45] += m[45] * mu */ + "ldr r10, [%[m], #180]\n\t" + "ldr r9, [%[a], #180]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str 
r9, [%[a], #180]\n\t" + /* a[i+46] += m[46] * mu */ + "ldr r10, [%[m], #184]\n\t" + "ldr r9, [%[a], #184]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #184]\n\t" + /* a[i+47] += m[47] * mu */ + "ldr r10, [%[m], #188]\n\t" + "ldr r9, [%[a], #188]\n\t" + "umaal r9, r3, r11, r10\n\t" + "ldr r11, [%[a], #192]\n\t" + "mov r10, #0\n\t" + "umaal r3, r11, r10, r10\n\t" + "str r9, [%[a], #188]\n\t" + "adds r3, r3, lr\n\t" + "adc lr, r11, #0\n\t" + "str r3, [%[a], #192]\n\t" + /* i += 1 */ + "add r12, r12, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r12, #0xc0\n\t" + "blt L_sp_3072_mont_reduce_48_word_%=\n\t" + /* Loop Done */ + "str r4, [%[a]]\n\t" + "str r5, [%[a], #4]\n\t" + "str r6, [%[a], #8]\n\t" + "str r7, [%[a], #12]\n\t" + "str r8, [%[a], #16]\n\t" + "mov %[mp], lr\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -34421,15 +34279,14 @@ SP_NOINLINE static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, */ static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ "ldr r8, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r5, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -34462,7 +34319,7 @@ static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "L_sp_3072_mul_d_48_word_%=: \n\t" /* A[i] * B */ "ldr r8, [%[a], r9]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -34507,7 +34364,7 @@ static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "str r3, [%[r], #192]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -34520,15 +34377,14 @@ static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r3, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -34553,1851 +34409,43 @@ static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) #else "umull r3, r4, %[b], r8\n\t" #endif + "stm %[r]!, {r3}\n\t" "mov r5, #0\n\t" - "str r3, [%[r]], #4\n\t" /* A[1] * B */ - "ldr r8, [%[a]], #4\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" "lsr r7, r7, #16\n\t" "mul r7, r6, r7\n\t" "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, #0\n\t" "lsr r7, r8, #16\n\t" "mul r6, r7, r6\n\t" "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, r7\n\t" "lsr r6, %[b], #16\n\t" "lsr r7, r8, #16\n\t" "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "add r5, r5, r7\n\t" "lsl r7, r8, #16\n\t" "lsr r7, r7, #16\n\t" "mul r6, r7, r6\n\t" "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" + "umlal r4, r5, %[b], r8\n\t" #endif - "str r4, [%[r]], #4\n\t" + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" /* A[2] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[3] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[4] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, 
#16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[5] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[6] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[7] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[8] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" 
- "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[9] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[10] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[11] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - 
"str r5, [%[r]], #4\n\t" - /* A[12] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[13] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[14] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[15] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - 
"mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[16] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[17] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[18] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[19] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, 
r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[20] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[21] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[22] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[23] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[24] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[25] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[26] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, 
r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[27] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[28] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[29] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[30] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, 
#0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[31] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[32] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[33] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[34] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, 
#16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[35] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[36] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[37] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, 
r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[38] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[39] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[40] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[41] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, 
r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[42] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[43] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[44] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[45] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, 
#0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[46] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[47] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -36423,15 +34471,1453 @@ static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "adds r5, r5, r6\n\t" "adc r3, r3, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[3] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[4] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, 
r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[5] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" "adds r5, r5, r6\n\t" "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" #endif - "str r5, [%[r]], #4\n\t" + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[6] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[7] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[8] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[9] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, 
r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[10] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[11] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[12] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[13] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[14] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr 
r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[15] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[16] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[17] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[18] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" 
+#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[19] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[20] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[21] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[22] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[23] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], 
#16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[24] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[25] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[26] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[27] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[28] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr 
r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[29] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[30] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[31] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[32] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, 
%[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[33] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[34] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[35] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[36] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[37] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, 
#16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[38] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[39] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[40] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[41] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[42] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul 
r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[43] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[44] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[45] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[46] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + 
"stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[47] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" "str r3, [%[r]]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" ); } @@ -36448,9 +35934,9 @@ static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr r6, %[div], #16\n\t" @@ -36507,9 +35993,9 @@ static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr lr, %[div], #1\n\t" @@ -36539,7 +36025,7 @@ static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "bpl L_div_3072_word_48_bit_%=\n\t" "add r3, r3, r3\n\t" "add r3, r3, #1\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -36567,7 +36053,7 @@ static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -36595,7 +36081,7 @@ static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -36643,8 +36129,8 @@ static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_int32 sp_3072_cmp_48(const sp_digit* a_p, const sp_digit* b_p) { - register const sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r2, #-1\n\t" @@ -37615,10 
+37101,10 @@ static void sp_3072_mont_norm_96(sp_digit* r, const sp_digit* m) */ static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r6, #0\n\t" @@ -37655,10 +37141,10 @@ static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp */ static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -38007,6 +37493,7 @@ static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp } #endif /* WOLFSSL_SP_SMALL */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Reduce the number back to 3072 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -38015,12 +37502,12 @@ static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp */ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* m asm ("r1") = m_p; - register sp_digit mp asm ("r2") = mp_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "ldr r11, [%[m]]\n\t" #endif /* i = 0 */ @@ -38033,10 +37520,9 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ /* mu = a[i] * mp */ "mul r8, %[mp], r12\n\t" /* a[i+0] += m[0] * mu */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r11, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r7\n\t" @@ -38060,14 +37546,8 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r12, r12, r6\n\t" "adc r5, r5, r7\n\t" -#else - "umull r6, r7, r8, r11\n\t" - "adds r12, r12, r6\n\t" - "adc r5, r7, #0\n\t" -#endif /* a[i+1] += m[1] * mu */ "ldr r7, [%[m], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r10\n\t" @@ -38091,18 +37571,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r4, r4, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r4, r10, #0\n\t" -#endif "mov r12, lr\n\t" "adds r12, r12, r5\n\t" "adc r4, r4, #0\n\t" /* a[i+2] 
+= m[2] * mu */ "ldr r7, [%[m], #8]\n\t" "ldr lr, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r10\n\t" @@ -38126,17 +37600,11 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r5, r5, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r5, r10, #0\n\t" -#endif "adds lr, lr, r4\n\t" "adc r5, r5, #0\n\t" /* a[i+3] += m[3] * mu */ "ldr r7, [%[m], #12]\n\t" "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38160,18 +37628,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #12]\n\t" "adc r4, r4, #0\n\t" /* a[i+4] += m[4] * mu */ "ldr r7, [%[m], #16]\n\t" "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38195,18 +37657,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #16]\n\t" "adc r5, r5, #0\n\t" /* a[i+5] += m[5] * mu */ "ldr r7, [%[m], #20]\n\t" "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38230,18 +37686,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #20]\n\t" "adc r4, r4, #0\n\t" /* a[i+6] += m[6] * mu */ "ldr r7, [%[m], #24]\n\t" "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38265,18 +37715,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #24]\n\t" "adc r5, r5, #0\n\t" /* a[i+7] += m[7] * mu */ "ldr r7, [%[m], #28]\n\t" "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38300,18 +37744,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #28]\n\t" "adc r4, r4, #0\n\t" /* a[i+8] += m[8] * mu */ "ldr r7, [%[m], #32]\n\t" "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38335,18 +37773,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const 
sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #32]\n\t" "adc r5, r5, #0\n\t" /* a[i+9] += m[9] * mu */ "ldr r7, [%[m], #36]\n\t" "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38370,18 +37802,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #36]\n\t" "adc r4, r4, #0\n\t" /* a[i+10] += m[10] * mu */ "ldr r7, [%[m], #40]\n\t" "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38405,18 +37831,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #40]\n\t" "adc r5, r5, #0\n\t" /* a[i+11] += m[11] * mu */ "ldr r7, [%[m], #44]\n\t" "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38440,18 +37860,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #44]\n\t" "adc r4, r4, #0\n\t" /* a[i+12] += m[12] * mu */ "ldr r7, [%[m], #48]\n\t" "ldr r10, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38475,18 +37889,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #48]\n\t" "adc r5, r5, #0\n\t" /* a[i+13] += m[13] * mu */ "ldr r7, [%[m], #52]\n\t" "ldr r10, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38510,18 +37918,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #52]\n\t" "adc r4, r4, #0\n\t" /* a[i+14] += m[14] * mu */ "ldr r7, [%[m], #56]\n\t" "ldr r10, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38545,18 +37947,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #56]\n\t" "adc r5, r5, #0\n\t" /* 
a[i+15] += m[15] * mu */ "ldr r7, [%[m], #60]\n\t" "ldr r10, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38580,18 +37976,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #60]\n\t" "adc r4, r4, #0\n\t" /* a[i+16] += m[16] * mu */ "ldr r7, [%[m], #64]\n\t" "ldr r10, [%[a], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38615,18 +38005,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #64]\n\t" "adc r5, r5, #0\n\t" /* a[i+17] += m[17] * mu */ "ldr r7, [%[m], #68]\n\t" "ldr r10, [%[a], #68]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38650,18 +38034,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #68]\n\t" "adc r4, r4, #0\n\t" /* a[i+18] += m[18] * mu */ "ldr r7, [%[m], #72]\n\t" "ldr r10, [%[a], #72]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38685,18 +38063,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #72]\n\t" "adc r5, r5, #0\n\t" /* a[i+19] += m[19] * mu */ "ldr r7, [%[m], #76]\n\t" "ldr r10, [%[a], #76]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38720,18 +38092,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #76]\n\t" "adc r4, r4, #0\n\t" /* a[i+20] += m[20] * mu */ "ldr r7, [%[m], #80]\n\t" "ldr r10, [%[a], #80]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38755,18 +38121,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #80]\n\t" "adc r5, r5, #0\n\t" /* a[i+21] += m[21] * mu */ "ldr r7, [%[m], #84]\n\t" "ldr r10, [%[a], #84]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38790,18 +38150,12 @@ static 
SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #84]\n\t" "adc r4, r4, #0\n\t" /* a[i+22] += m[22] * mu */ "ldr r7, [%[m], #88]\n\t" "ldr r10, [%[a], #88]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38825,18 +38179,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #88]\n\t" "adc r5, r5, #0\n\t" /* a[i+23] += m[23] * mu */ "ldr r7, [%[m], #92]\n\t" "ldr r10, [%[a], #92]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38860,18 +38208,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #92]\n\t" "adc r4, r4, #0\n\t" /* a[i+24] += m[24] * mu */ "ldr r7, [%[m], #96]\n\t" "ldr r10, [%[a], #96]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38895,18 +38237,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #96]\n\t" "adc r5, r5, #0\n\t" /* a[i+25] += m[25] * mu */ "ldr r7, [%[m], #100]\n\t" "ldr r10, [%[a], #100]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -38930,18 +38266,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #100]\n\t" "adc r4, r4, #0\n\t" /* a[i+26] += m[26] * mu */ "ldr r7, [%[m], #104]\n\t" "ldr r10, [%[a], #104]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -38965,18 +38295,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #104]\n\t" "adc r5, r5, #0\n\t" /* a[i+27] += m[27] * mu */ "ldr r7, [%[m], #108]\n\t" "ldr r10, [%[a], #108]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39000,18 +38324,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds 
r10, r10, r5\n\t" "str r10, [%[a], #108]\n\t" "adc r4, r4, #0\n\t" /* a[i+28] += m[28] * mu */ "ldr r7, [%[m], #112]\n\t" "ldr r10, [%[a], #112]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39035,18 +38353,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #112]\n\t" "adc r5, r5, #0\n\t" /* a[i+29] += m[29] * mu */ "ldr r7, [%[m], #116]\n\t" "ldr r10, [%[a], #116]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39070,18 +38382,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #116]\n\t" "adc r4, r4, #0\n\t" /* a[i+30] += m[30] * mu */ "ldr r7, [%[m], #120]\n\t" "ldr r10, [%[a], #120]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39105,18 +38411,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #120]\n\t" "adc r5, r5, #0\n\t" /* a[i+31] += m[31] * mu */ "ldr r7, [%[m], #124]\n\t" "ldr r10, [%[a], #124]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39140,18 +38440,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #124]\n\t" "adc r4, r4, #0\n\t" /* a[i+32] += m[32] * mu */ "ldr r7, [%[m], #128]\n\t" "ldr r10, [%[a], #128]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39175,18 +38469,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #128]\n\t" "adc r5, r5, #0\n\t" /* a[i+33] += m[33] * mu */ "ldr r7, [%[m], #132]\n\t" "ldr r10, [%[a], #132]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39210,18 +38498,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #132]\n\t" "adc r4, r4, #0\n\t" /* a[i+34] += m[34] * mu */ "ldr r7, [%[m], #136]\n\t" "ldr r10, [%[a], #136]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, 
#16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39245,18 +38527,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #136]\n\t" "adc r5, r5, #0\n\t" /* a[i+35] += m[35] * mu */ "ldr r7, [%[m], #140]\n\t" "ldr r10, [%[a], #140]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39280,18 +38556,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #140]\n\t" "adc r4, r4, #0\n\t" /* a[i+36] += m[36] * mu */ "ldr r7, [%[m], #144]\n\t" "ldr r10, [%[a], #144]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39315,18 +38585,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #144]\n\t" "adc r5, r5, #0\n\t" /* a[i+37] += m[37] * mu */ "ldr r7, [%[m], #148]\n\t" "ldr r10, [%[a], #148]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39350,18 +38614,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #148]\n\t" "adc r4, r4, #0\n\t" /* a[i+38] += m[38] * mu */ "ldr r7, [%[m], #152]\n\t" "ldr r10, [%[a], #152]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39385,18 +38643,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #152]\n\t" "adc r5, r5, #0\n\t" /* a[i+39] += m[39] * mu */ "ldr r7, [%[m], #156]\n\t" "ldr r10, [%[a], #156]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39420,18 +38672,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #156]\n\t" "adc r4, r4, #0\n\t" /* a[i+40] += m[40] * mu */ "ldr r7, [%[m], #160]\n\t" "ldr r10, [%[a], #160]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39455,18 +38701,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" 
-#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #160]\n\t" "adc r5, r5, #0\n\t" /* a[i+41] += m[41] * mu */ "ldr r7, [%[m], #164]\n\t" "ldr r10, [%[a], #164]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39490,18 +38730,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #164]\n\t" "adc r4, r4, #0\n\t" /* a[i+42] += m[42] * mu */ "ldr r7, [%[m], #168]\n\t" "ldr r10, [%[a], #168]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39525,18 +38759,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #168]\n\t" "adc r5, r5, #0\n\t" /* a[i+43] += m[43] * mu */ "ldr r7, [%[m], #172]\n\t" "ldr r10, [%[a], #172]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39560,18 +38788,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #172]\n\t" "adc r4, r4, #0\n\t" /* a[i+44] += m[44] * mu */ "ldr r7, [%[m], #176]\n\t" "ldr r10, [%[a], #176]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39595,18 +38817,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #176]\n\t" "adc r5, r5, #0\n\t" /* a[i+45] += m[45] * mu */ "ldr r7, [%[m], #180]\n\t" "ldr r10, [%[a], #180]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39630,18 +38846,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #180]\n\t" "adc r4, r4, #0\n\t" /* a[i+46] += m[46] * mu */ "ldr r7, [%[m], #184]\n\t" "ldr r10, [%[a], #184]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39665,18 +38875,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #184]\n\t" "adc r5, r5, #0\n\t" /* a[i+47] += m[47] * mu */ "ldr r7, [%[m], #188]\n\t" "ldr r10, 
[%[a], #188]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39700,18 +38904,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #188]\n\t" "adc r4, r4, #0\n\t" /* a[i+48] += m[48] * mu */ "ldr r7, [%[m], #192]\n\t" "ldr r10, [%[a], #192]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39735,18 +38933,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #192]\n\t" "adc r5, r5, #0\n\t" /* a[i+49] += m[49] * mu */ "ldr r7, [%[m], #196]\n\t" "ldr r10, [%[a], #196]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39770,18 +38962,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #196]\n\t" "adc r4, r4, #0\n\t" /* a[i+50] += m[50] * mu */ "ldr r7, [%[m], #200]\n\t" "ldr r10, [%[a], #200]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39805,18 +38991,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #200]\n\t" "adc r5, r5, #0\n\t" /* a[i+51] += m[51] * mu */ "ldr r7, [%[m], #204]\n\t" "ldr r10, [%[a], #204]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39840,18 +39020,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #204]\n\t" "adc r4, r4, #0\n\t" /* a[i+52] += m[52] * mu */ "ldr r7, [%[m], #208]\n\t" "ldr r10, [%[a], #208]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39875,18 +39049,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #208]\n\t" "adc r5, r5, #0\n\t" /* a[i+53] += m[53] * mu */ "ldr r7, [%[m], #212]\n\t" "ldr r10, [%[a], #212]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39910,18 +39078,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* 
a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #212]\n\t" "adc r4, r4, #0\n\t" /* a[i+54] += m[54] * mu */ "ldr r7, [%[m], #216]\n\t" "ldr r10, [%[a], #216]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -39945,18 +39107,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #216]\n\t" "adc r5, r5, #0\n\t" /* a[i+55] += m[55] * mu */ "ldr r7, [%[m], #220]\n\t" "ldr r10, [%[a], #220]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -39980,18 +39136,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #220]\n\t" "adc r4, r4, #0\n\t" /* a[i+56] += m[56] * mu */ "ldr r7, [%[m], #224]\n\t" "ldr r10, [%[a], #224]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40015,18 +39165,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #224]\n\t" "adc r5, r5, #0\n\t" /* a[i+57] += m[57] * mu */ "ldr r7, [%[m], #228]\n\t" "ldr r10, [%[a], #228]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40050,18 +39194,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #228]\n\t" "adc r4, r4, #0\n\t" /* a[i+58] += m[58] * mu */ "ldr r7, [%[m], #232]\n\t" "ldr r10, [%[a], #232]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40085,18 +39223,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #232]\n\t" "adc r5, r5, #0\n\t" /* a[i+59] += m[59] * mu */ "ldr r7, [%[m], #236]\n\t" "ldr r10, [%[a], #236]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40120,18 +39252,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], 
#236]\n\t" "adc r4, r4, #0\n\t" /* a[i+60] += m[60] * mu */ "ldr r7, [%[m], #240]\n\t" "ldr r10, [%[a], #240]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40155,18 +39281,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #240]\n\t" "adc r5, r5, #0\n\t" /* a[i+61] += m[61] * mu */ "ldr r7, [%[m], #244]\n\t" "ldr r10, [%[a], #244]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40190,18 +39310,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #244]\n\t" "adc r4, r4, #0\n\t" /* a[i+62] += m[62] * mu */ "ldr r7, [%[m], #248]\n\t" "ldr r10, [%[a], #248]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40225,18 +39339,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #248]\n\t" "adc r5, r5, #0\n\t" /* a[i+63] += m[63] * mu */ "ldr r7, [%[m], #252]\n\t" "ldr r10, [%[a], #252]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40260,18 +39368,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #252]\n\t" "adc r4, r4, #0\n\t" /* a[i+64] += m[64] * mu */ "ldr r7, [%[m], #256]\n\t" "ldr r10, [%[a], #256]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40295,18 +39397,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #256]\n\t" "adc r5, r5, #0\n\t" /* a[i+65] += m[65] * mu */ "ldr r7, [%[m], #260]\n\t" "ldr r10, [%[a], #260]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40330,18 +39426,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #260]\n\t" "adc r4, r4, #0\n\t" /* a[i+66] += m[66] * mu */ "ldr r7, [%[m], #264]\n\t" "ldr r10, [%[a], #264]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul 
r5, r6, r11\n\t" @@ -40365,18 +39455,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #264]\n\t" "adc r5, r5, #0\n\t" /* a[i+67] += m[67] * mu */ "ldr r7, [%[m], #268]\n\t" "ldr r10, [%[a], #268]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40400,18 +39484,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #268]\n\t" "adc r4, r4, #0\n\t" /* a[i+68] += m[68] * mu */ "ldr r7, [%[m], #272]\n\t" "ldr r10, [%[a], #272]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40435,18 +39513,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #272]\n\t" "adc r5, r5, #0\n\t" /* a[i+69] += m[69] * mu */ "ldr r7, [%[m], #276]\n\t" "ldr r10, [%[a], #276]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40470,18 +39542,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #276]\n\t" "adc r4, r4, #0\n\t" /* a[i+70] += m[70] * mu */ "ldr r7, [%[m], #280]\n\t" "ldr r10, [%[a], #280]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40505,18 +39571,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #280]\n\t" "adc r5, r5, #0\n\t" /* a[i+71] += m[71] * mu */ "ldr r7, [%[m], #284]\n\t" "ldr r10, [%[a], #284]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40540,18 +39600,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #284]\n\t" "adc r4, r4, #0\n\t" /* a[i+72] += m[72] * mu */ "ldr r7, [%[m], #288]\n\t" "ldr r10, [%[a], #288]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40575,18 +39629,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - 
"adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #288]\n\t" "adc r5, r5, #0\n\t" /* a[i+73] += m[73] * mu */ "ldr r7, [%[m], #292]\n\t" "ldr r10, [%[a], #292]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40610,18 +39658,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #292]\n\t" "adc r4, r4, #0\n\t" /* a[i+74] += m[74] * mu */ "ldr r7, [%[m], #296]\n\t" "ldr r10, [%[a], #296]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40645,18 +39687,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #296]\n\t" "adc r5, r5, #0\n\t" /* a[i+75] += m[75] * mu */ "ldr r7, [%[m], #300]\n\t" "ldr r10, [%[a], #300]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40680,18 +39716,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #300]\n\t" "adc r4, r4, #0\n\t" /* a[i+76] += m[76] * mu */ "ldr r7, [%[m], #304]\n\t" "ldr r10, [%[a], #304]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40715,18 +39745,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #304]\n\t" "adc r5, r5, #0\n\t" /* a[i+77] += m[77] * mu */ "ldr r7, [%[m], #308]\n\t" "ldr r10, [%[a], #308]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40750,18 +39774,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #308]\n\t" "adc r4, r4, #0\n\t" /* a[i+78] += m[78] * mu */ "ldr r7, [%[m], #312]\n\t" "ldr r10, [%[a], #312]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40785,18 +39803,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #312]\n\t" "adc r5, r5, #0\n\t" /* a[i+79] += m[79] * mu */ "ldr r7, [%[m], #316]\n\t" "ldr r10, [%[a], #316]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40820,18 +39832,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #316]\n\t" "adc r4, r4, #0\n\t" /* a[i+80] += m[80] * mu */ "ldr r7, [%[m], #320]\n\t" "ldr r10, [%[a], #320]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40855,18 +39861,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #320]\n\t" "adc r5, r5, #0\n\t" /* a[i+81] += m[81] * mu */ "ldr r7, [%[m], #324]\n\t" "ldr r10, [%[a], #324]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40890,18 +39890,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #324]\n\t" "adc r4, r4, #0\n\t" /* a[i+82] += m[82] * mu */ "ldr r7, [%[m], #328]\n\t" "ldr r10, [%[a], #328]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40925,18 +39919,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #328]\n\t" "adc r5, r5, #0\n\t" /* a[i+83] += m[83] * mu */ "ldr r7, [%[m], #332]\n\t" "ldr r10, [%[a], #332]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -40960,18 +39948,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #332]\n\t" "adc r4, r4, #0\n\t" /* a[i+84] += m[84] * mu */ "ldr r7, [%[m], #336]\n\t" "ldr r10, [%[a], #336]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -40995,18 +39977,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #336]\n\t" "adc r5, r5, #0\n\t" /* a[i+85] += m[85] * mu */ "ldr r7, [%[m], #340]\n\t" "ldr r10, [%[a], #340]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -41030,18 +40006,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* 
m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #340]\n\t" "adc r4, r4, #0\n\t" /* a[i+86] += m[86] * mu */ "ldr r7, [%[m], #344]\n\t" "ldr r10, [%[a], #344]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -41065,18 +40035,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #344]\n\t" "adc r5, r5, #0\n\t" /* a[i+87] += m[87] * mu */ "ldr r7, [%[m], #348]\n\t" "ldr r10, [%[a], #348]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -41100,18 +40064,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #348]\n\t" "adc r4, r4, #0\n\t" /* a[i+88] += m[88] * mu */ "ldr r7, [%[m], #352]\n\t" "ldr r10, [%[a], #352]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -41135,18 +40093,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #352]\n\t" "adc r5, r5, #0\n\t" /* a[i+89] += m[89] * mu */ "ldr r7, [%[m], #356]\n\t" "ldr r10, [%[a], #356]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -41170,18 +40122,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #356]\n\t" "adc r4, r4, #0\n\t" /* a[i+90] += m[90] * mu */ "ldr r7, [%[m], #360]\n\t" "ldr r10, [%[a], #360]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -41205,18 +40151,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #360]\n\t" "adc r5, r5, #0\n\t" /* a[i+91] += m[91] * mu */ "ldr r7, [%[m], #364]\n\t" "ldr r10, [%[a], #364]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -41240,18 +40180,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #364]\n\t" "adc r4, r4, 
#0\n\t" /* a[i+92] += m[92] * mu */ "ldr r7, [%[m], #368]\n\t" "ldr r10, [%[a], #368]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -41275,18 +40209,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #368]\n\t" "adc r5, r5, #0\n\t" /* a[i+93] += m[93] * mu */ "ldr r7, [%[m], #372]\n\t" "ldr r10, [%[a], #372]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -41310,18 +40238,12 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #372]\n\t" "adc r4, r4, #0\n\t" /* a[i+94] += m[94] * mu */ "ldr r7, [%[m], #376]\n\t" "ldr r10, [%[a], #376]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -41345,22 +40267,16 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #376]\n\t" "adc r5, r5, #0\n\t" /* a[i+95] += m[95] * mu */ -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) - "ldr r7, [%[m], #380]\n\t" -#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m], #380]\n\t" +#else + "ldr r7, [%[m], #380]\n\t" #endif "ldr r10, [%[a], #380]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r11, #16\n\t" "lsr r6, r6, #16\n\t" @@ -41391,13 +40307,6 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "adds r5, r5, r6\n\t" "adcs r4, r4, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r7, r3\n\t" - "mov r3, #0\n\t" - "adc r3, r3, r3\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #380]\n\t" "ldr r10, [%[a], #384]\n\t" @@ -41409,6 +40318,7 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ "add %[a], %[a], #4\n\t" "cmp r9, #0x180\n\t" "blt L_sp_3072_mont_reduce_96_word_%=\n\t" + /* Loop Done */ "str r12, [%[a]]\n\t" "str lr, [%[a], #4]\n\t" "mov %[mp], r3\n\t" @@ -41419,6 +40329,1339 @@ static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_ sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + "ldr r11, [%[m]]\n\t" + /* i = 0 */ + "mov r9, #0\n\t" + "mov r3, #0\n\t" + "ldr r12, [%[a]]\n\t" + "ldr lr, [%[a], #4]\n\t" + "\n" + "L_sp_3072_mont_reduce_96_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r8, %[mp], r12\n\t" + /* a[i+0] += m[0] * mu */ + "mov r5, #0\n\t" + "umlal r12, r5, r8, r11\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r7, [%[m], #4]\n\t" + "mov r4, #0\n\t" + "umlal lr, r4, r8, r7\n\t" + "mov r12, lr\n\t" + "adds r12, r12, r5\n\t" + "adc r4, r4, #0\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r7, [%[m], #8]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r5, #0\n\t" + "umlal lr, r5, r8, r7\n\t" + "adds lr, lr, r4\n\t" + "adc r5, r5, #0\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r7, [%[m], #12]\n\t" + "ldr r10, [%[a], #12]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #12]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r7, [%[m], #16]\n\t" + "ldr r10, [%[a], #16]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #16]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r7, [%[m], #20]\n\t" + "ldr r10, [%[a], #20]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #20]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r7, [%[m], #24]\n\t" + "ldr r10, [%[a], #24]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #24]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r7, [%[m], #28]\n\t" + "ldr r10, [%[a], #28]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #28]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r7, [%[m], #32]\n\t" + "ldr r10, [%[a], #32]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #32]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r7, [%[m], #36]\n\t" + "ldr r10, [%[a], #36]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #36]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r7, [%[m], #40]\n\t" + "ldr r10, [%[a], #40]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #40]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r7, [%[m], #44]\n\t" + "ldr r10, [%[a], #44]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #44]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r7, [%[m], #48]\n\t" + "ldr r10, [%[a], #48]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #48]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r7, [%[m], #52]\n\t" + "ldr r10, [%[a], #52]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #52]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r7, [%[m], #56]\n\t" + "ldr r10, [%[a], #56]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #56]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r7, 
[%[m], #60]\n\t" + "ldr r10, [%[a], #60]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #60]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r7, [%[m], #64]\n\t" + "ldr r10, [%[a], #64]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #64]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r7, [%[m], #68]\n\t" + "ldr r10, [%[a], #68]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #68]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r7, [%[m], #72]\n\t" + "ldr r10, [%[a], #72]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #72]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r7, [%[m], #76]\n\t" + "ldr r10, [%[a], #76]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #76]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r7, [%[m], #80]\n\t" + "ldr r10, [%[a], #80]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #80]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r7, [%[m], #84]\n\t" + "ldr r10, [%[a], #84]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #84]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r7, [%[m], #88]\n\t" + "ldr r10, [%[a], #88]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #88]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r7, [%[m], #92]\n\t" + "ldr r10, [%[a], #92]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #92]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r7, [%[m], #96]\n\t" + "ldr r10, [%[a], #96]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #96]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r7, [%[m], #100]\n\t" + "ldr r10, [%[a], #100]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #100]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r7, [%[m], #104]\n\t" + "ldr r10, [%[a], #104]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #104]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r7, [%[m], #108]\n\t" + "ldr r10, [%[a], #108]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #108]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r7, [%[m], #112]\n\t" + "ldr r10, [%[a], #112]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #112]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r7, [%[m], #116]\n\t" + "ldr r10, [%[a], #116]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #116]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r7, [%[m], #120]\n\t" + "ldr r10, [%[a], #120]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #120]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r7, [%[m], #124]\n\t" + "ldr r10, [%[a], #124]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, 
r10, r5\n\t" + "str r10, [%[a], #124]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+32] += m[32] * mu */ + "ldr r7, [%[m], #128]\n\t" + "ldr r10, [%[a], #128]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #128]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+33] += m[33] * mu */ + "ldr r7, [%[m], #132]\n\t" + "ldr r10, [%[a], #132]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #132]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+34] += m[34] * mu */ + "ldr r7, [%[m], #136]\n\t" + "ldr r10, [%[a], #136]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #136]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+35] += m[35] * mu */ + "ldr r7, [%[m], #140]\n\t" + "ldr r10, [%[a], #140]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #140]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+36] += m[36] * mu */ + "ldr r7, [%[m], #144]\n\t" + "ldr r10, [%[a], #144]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #144]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+37] += m[37] * mu */ + "ldr r7, [%[m], #148]\n\t" + "ldr r10, [%[a], #148]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #148]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+38] += m[38] * mu */ + "ldr r7, [%[m], #152]\n\t" + "ldr r10, [%[a], #152]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #152]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+39] += m[39] * mu */ + "ldr r7, [%[m], #156]\n\t" + "ldr r10, [%[a], #156]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #156]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+40] += m[40] * mu */ + "ldr r7, [%[m], #160]\n\t" + "ldr r10, [%[a], #160]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #160]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+41] += m[41] * mu */ + "ldr r7, [%[m], #164]\n\t" + "ldr r10, [%[a], #164]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #164]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+42] += m[42] * mu */ + "ldr r7, [%[m], #168]\n\t" + "ldr r10, [%[a], #168]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #168]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+43] += m[43] * mu */ + "ldr r7, [%[m], #172]\n\t" + "ldr r10, [%[a], #172]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #172]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+44] += m[44] * mu */ + "ldr r7, [%[m], #176]\n\t" + "ldr r10, [%[a], #176]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #176]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+45] += m[45] * mu */ + "ldr r7, [%[m], #180]\n\t" + "ldr r10, [%[a], #180]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #180]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+46] += m[46] * mu */ + "ldr r7, [%[m], #184]\n\t" + "ldr r10, [%[a], #184]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #184]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+47] += m[47] * mu */ + "ldr r7, [%[m], #188]\n\t" + "ldr r10, [%[a], #188]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #188]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+48] += 
m[48] * mu */ + "ldr r7, [%[m], #192]\n\t" + "ldr r10, [%[a], #192]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #192]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+49] += m[49] * mu */ + "ldr r7, [%[m], #196]\n\t" + "ldr r10, [%[a], #196]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #196]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+50] += m[50] * mu */ + "ldr r7, [%[m], #200]\n\t" + "ldr r10, [%[a], #200]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #200]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+51] += m[51] * mu */ + "ldr r7, [%[m], #204]\n\t" + "ldr r10, [%[a], #204]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #204]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+52] += m[52] * mu */ + "ldr r7, [%[m], #208]\n\t" + "ldr r10, [%[a], #208]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #208]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+53] += m[53] * mu */ + "ldr r7, [%[m], #212]\n\t" + "ldr r10, [%[a], #212]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #212]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+54] += m[54] * mu */ + "ldr r7, [%[m], #216]\n\t" + "ldr r10, [%[a], #216]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #216]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+55] += m[55] * mu */ + "ldr r7, [%[m], #220]\n\t" + "ldr r10, [%[a], #220]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #220]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+56] += m[56] * mu */ + "ldr r7, [%[m], #224]\n\t" + "ldr r10, [%[a], #224]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #224]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+57] += m[57] * mu */ + "ldr r7, [%[m], #228]\n\t" + "ldr r10, [%[a], #228]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #228]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+58] += m[58] * mu */ + "ldr r7, [%[m], #232]\n\t" + "ldr r10, [%[a], #232]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #232]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+59] += m[59] * mu */ + "ldr r7, [%[m], #236]\n\t" + "ldr r10, [%[a], #236]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #236]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+60] += m[60] * mu */ + "ldr r7, [%[m], #240]\n\t" + "ldr r10, [%[a], #240]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #240]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+61] += m[61] * mu */ + "ldr r7, [%[m], #244]\n\t" + "ldr r10, [%[a], #244]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #244]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+62] += m[62] * mu */ + "ldr r7, [%[m], #248]\n\t" + "ldr r10, [%[a], #248]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #248]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+63] += m[63] * mu */ + "ldr r7, [%[m], #252]\n\t" + "ldr r10, [%[a], #252]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #252]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+64] += m[64] * mu */ + "ldr r7, [%[m], #256]\n\t" + "ldr r10, [%[a], #256]\n\t" + "mov 
r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #256]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+65] += m[65] * mu */ + "ldr r7, [%[m], #260]\n\t" + "ldr r10, [%[a], #260]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #260]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+66] += m[66] * mu */ + "ldr r7, [%[m], #264]\n\t" + "ldr r10, [%[a], #264]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #264]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+67] += m[67] * mu */ + "ldr r7, [%[m], #268]\n\t" + "ldr r10, [%[a], #268]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #268]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+68] += m[68] * mu */ + "ldr r7, [%[m], #272]\n\t" + "ldr r10, [%[a], #272]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #272]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+69] += m[69] * mu */ + "ldr r7, [%[m], #276]\n\t" + "ldr r10, [%[a], #276]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #276]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+70] += m[70] * mu */ + "ldr r7, [%[m], #280]\n\t" + "ldr r10, [%[a], #280]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #280]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+71] += m[71] * mu */ + "ldr r7, [%[m], #284]\n\t" + "ldr r10, [%[a], #284]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #284]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+72] += m[72] * mu */ + "ldr r7, [%[m], #288]\n\t" + "ldr r10, [%[a], #288]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #288]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+73] += m[73] * mu */ + "ldr r7, [%[m], #292]\n\t" + "ldr r10, [%[a], #292]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #292]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+74] += m[74] * mu */ + "ldr r7, [%[m], #296]\n\t" + "ldr r10, [%[a], #296]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #296]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+75] += m[75] * mu */ + "ldr r7, [%[m], #300]\n\t" + "ldr r10, [%[a], #300]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #300]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+76] += m[76] * mu */ + "ldr r7, [%[m], #304]\n\t" + "ldr r10, [%[a], #304]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #304]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+77] += m[77] * mu */ + "ldr r7, [%[m], #308]\n\t" + "ldr r10, [%[a], #308]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #308]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+78] += m[78] * mu */ + "ldr r7, [%[m], #312]\n\t" + "ldr r10, [%[a], #312]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #312]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+79] += m[79] * mu */ + "ldr r7, [%[m], #316]\n\t" + "ldr r10, [%[a], #316]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #316]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+80] += m[80] * mu */ + "ldr r7, [%[m], #320]\n\t" + "ldr r10, [%[a], #320]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, 
[%[a], #320]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+81] += m[81] * mu */ + "ldr r7, [%[m], #324]\n\t" + "ldr r10, [%[a], #324]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #324]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+82] += m[82] * mu */ + "ldr r7, [%[m], #328]\n\t" + "ldr r10, [%[a], #328]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #328]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+83] += m[83] * mu */ + "ldr r7, [%[m], #332]\n\t" + "ldr r10, [%[a], #332]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #332]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+84] += m[84] * mu */ + "ldr r7, [%[m], #336]\n\t" + "ldr r10, [%[a], #336]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #336]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+85] += m[85] * mu */ + "ldr r7, [%[m], #340]\n\t" + "ldr r10, [%[a], #340]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #340]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+86] += m[86] * mu */ + "ldr r7, [%[m], #344]\n\t" + "ldr r10, [%[a], #344]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #344]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+87] += m[87] * mu */ + "ldr r7, [%[m], #348]\n\t" + "ldr r10, [%[a], #348]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #348]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+88] += m[88] * mu */ + "ldr r7, [%[m], #352]\n\t" + "ldr r10, [%[a], #352]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #352]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+89] += m[89] * mu */ + "ldr r7, [%[m], #356]\n\t" + "ldr r10, [%[a], #356]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #356]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+90] += m[90] * mu */ + "ldr r7, [%[m], #360]\n\t" + "ldr r10, [%[a], #360]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #360]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+91] += m[91] * mu */ + "ldr r7, [%[m], #364]\n\t" + "ldr r10, [%[a], #364]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #364]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+92] += m[92] * mu */ + "ldr r7, [%[m], #368]\n\t" + "ldr r10, [%[a], #368]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #368]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+93] += m[93] * mu */ + "ldr r7, [%[m], #372]\n\t" + "ldr r10, [%[a], #372]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #372]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+94] += m[94] * mu */ + "ldr r7, [%[m], #376]\n\t" + "ldr r10, [%[a], #376]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #376]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+95] += m[95] * mu */ + "ldr r7, [%[m], #380]\n\t" + "ldr r10, [%[a], #380]\n\t" + "umull r6, r7, r8, r7\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r7, r3\n\t" + "mov r3, #0\n\t" + "adc r3, r3, r3\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #380]\n\t" + "ldr r10, [%[a], #384]\n\t" + "adcs r10, r10, r4\n\t" + "str r10, [%[a], #384]\n\t" + "adc r3, r3, #0\n\t" + /* i += 1 */ + "add r9, r9, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r9, #0x180\n\t" + "blt 
L_sp_3072_mont_reduce_96_word_%=\n\t" + /* Loop Done */ + "str r12, [%[a]]\n\t" + "str lr, [%[a], #4]\n\t" + "mov %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp); +} + +#else +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "\n" + "L_sp_3072_mont_reduce_96_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r11, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "ldr r10, [%[m]]\n\t" + "mov r3, #0\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r10, [%[m], #4]\n\t" + "mov r4, r5\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r10, [%[m], #8]\n\t" + "mov r5, r6\n\t" + "umaal r5, r3, r11, r10\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r10, [%[m], #12]\n\t" + "mov r6, r7\n\t" + "umaal r6, r3, r11, r10\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r10, [%[m], #16]\n\t" + "mov r7, r8\n\t" + "umaal r7, r3, r11, r10\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r10, [%[m], #20]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umaal r8, r3, r11, r10\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r10, [%[m], #24]\n\t" + "ldr r9, [%[a], #24]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r10, [%[m], #28]\n\t" + "ldr r9, [%[a], #28]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r10, [%[m], #32]\n\t" + "ldr r9, [%[a], #32]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r10, [%[m], #36]\n\t" + "ldr r9, [%[a], #36]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r10, [%[m], #40]\n\t" + "ldr r9, [%[a], #40]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r10, [%[m], #44]\n\t" + "ldr r9, [%[a], #44]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r10, [%[m], #48]\n\t" + "ldr r9, [%[a], #48]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r10, [%[m], #52]\n\t" + "ldr r9, [%[a], #52]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r10, [%[m], #56]\n\t" + "ldr r9, [%[a], #56]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r10, [%[m], #60]\n\t" + "ldr r9, [%[a], #60]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r10, [%[m], #64]\n\t" + "ldr r9, [%[a], #64]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r10, [%[m], #68]\n\t" + "ldr r9, [%[a], #68]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], 
#68]\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r10, [%[m], #72]\n\t" + "ldr r9, [%[a], #72]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r10, [%[m], #76]\n\t" + "ldr r9, [%[a], #76]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r10, [%[m], #80]\n\t" + "ldr r9, [%[a], #80]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r10, [%[m], #84]\n\t" + "ldr r9, [%[a], #84]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #84]\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r10, [%[m], #88]\n\t" + "ldr r9, [%[a], #88]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r10, [%[m], #92]\n\t" + "ldr r9, [%[a], #92]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r10, [%[m], #96]\n\t" + "ldr r9, [%[a], #96]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r10, [%[m], #100]\n\t" + "ldr r9, [%[a], #100]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r10, [%[m], #104]\n\t" + "ldr r9, [%[a], #104]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r10, [%[m], #108]\n\t" + "ldr r9, [%[a], #108]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r10, [%[m], #112]\n\t" + "ldr r9, [%[a], #112]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r10, [%[m], #116]\n\t" + "ldr r9, [%[a], #116]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r10, [%[m], #120]\n\t" + "ldr r9, [%[a], #120]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r10, [%[m], #124]\n\t" + "ldr r9, [%[a], #124]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #124]\n\t" + /* a[i+32] += m[32] * mu */ + "ldr r10, [%[m], #128]\n\t" + "ldr r9, [%[a], #128]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #128]\n\t" + /* a[i+33] += m[33] * mu */ + "ldr r10, [%[m], #132]\n\t" + "ldr r9, [%[a], #132]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #132]\n\t" + /* a[i+34] += m[34] * mu */ + "ldr r10, [%[m], #136]\n\t" + "ldr r9, [%[a], #136]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #136]\n\t" + /* a[i+35] += m[35] * mu */ + "ldr r10, [%[m], #140]\n\t" + "ldr r9, [%[a], #140]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #140]\n\t" + /* a[i+36] += m[36] * mu */ + "ldr r10, [%[m], #144]\n\t" + "ldr r9, [%[a], #144]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #144]\n\t" + /* a[i+37] += m[37] * mu */ + "ldr r10, [%[m], #148]\n\t" + "ldr r9, [%[a], #148]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #148]\n\t" + /* a[i+38] += m[38] * mu */ + "ldr r10, [%[m], #152]\n\t" + "ldr r9, [%[a], #152]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #152]\n\t" + /* a[i+39] += m[39] * mu */ + "ldr r10, [%[m], #156]\n\t" + "ldr r9, [%[a], #156]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #156]\n\t" + /* a[i+40] += m[40] * mu */ + "ldr r10, [%[m], #160]\n\t" + "ldr r9, [%[a], #160]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #160]\n\t" + /* a[i+41] += m[41] * mu */ + "ldr r10, [%[m], #164]\n\t" + "ldr r9, [%[a], #164]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], 
#164]\n\t" + /* a[i+42] += m[42] * mu */ + "ldr r10, [%[m], #168]\n\t" + "ldr r9, [%[a], #168]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #168]\n\t" + /* a[i+43] += m[43] * mu */ + "ldr r10, [%[m], #172]\n\t" + "ldr r9, [%[a], #172]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #172]\n\t" + /* a[i+44] += m[44] * mu */ + "ldr r10, [%[m], #176]\n\t" + "ldr r9, [%[a], #176]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #176]\n\t" + /* a[i+45] += m[45] * mu */ + "ldr r10, [%[m], #180]\n\t" + "ldr r9, [%[a], #180]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #180]\n\t" + /* a[i+46] += m[46] * mu */ + "ldr r10, [%[m], #184]\n\t" + "ldr r9, [%[a], #184]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #184]\n\t" + /* a[i+47] += m[47] * mu */ + "ldr r10, [%[m], #188]\n\t" + "ldr r9, [%[a], #188]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #188]\n\t" + /* a[i+48] += m[48] * mu */ + "ldr r10, [%[m], #192]\n\t" + "ldr r9, [%[a], #192]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #192]\n\t" + /* a[i+49] += m[49] * mu */ + "ldr r10, [%[m], #196]\n\t" + "ldr r9, [%[a], #196]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #196]\n\t" + /* a[i+50] += m[50] * mu */ + "ldr r10, [%[m], #200]\n\t" + "ldr r9, [%[a], #200]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #200]\n\t" + /* a[i+51] += m[51] * mu */ + "ldr r10, [%[m], #204]\n\t" + "ldr r9, [%[a], #204]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #204]\n\t" + /* a[i+52] += m[52] * mu */ + "ldr r10, [%[m], #208]\n\t" + "ldr r9, [%[a], #208]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #208]\n\t" + /* a[i+53] += m[53] * mu */ + "ldr r10, [%[m], #212]\n\t" + "ldr r9, [%[a], #212]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #212]\n\t" + /* a[i+54] += m[54] * mu */ + "ldr r10, [%[m], #216]\n\t" + "ldr r9, [%[a], #216]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #216]\n\t" + /* a[i+55] += m[55] * mu */ + "ldr r10, [%[m], #220]\n\t" + "ldr r9, [%[a], #220]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #220]\n\t" + /* a[i+56] += m[56] * mu */ + "ldr r10, [%[m], #224]\n\t" + "ldr r9, [%[a], #224]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #224]\n\t" + /* a[i+57] += m[57] * mu */ + "ldr r10, [%[m], #228]\n\t" + "ldr r9, [%[a], #228]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #228]\n\t" + /* a[i+58] += m[58] * mu */ + "ldr r10, [%[m], #232]\n\t" + "ldr r9, [%[a], #232]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #232]\n\t" + /* a[i+59] += m[59] * mu */ + "ldr r10, [%[m], #236]\n\t" + "ldr r9, [%[a], #236]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #236]\n\t" + /* a[i+60] += m[60] * mu */ + "ldr r10, [%[m], #240]\n\t" + "ldr r9, [%[a], #240]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #240]\n\t" + /* a[i+61] += m[61] * mu */ + "ldr r10, [%[m], #244]\n\t" + "ldr r9, [%[a], #244]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #244]\n\t" + /* a[i+62] += m[62] * mu */ + "ldr r10, [%[m], #248]\n\t" + "ldr r9, [%[a], #248]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #248]\n\t" + /* a[i+63] += m[63] * mu */ + "ldr r10, [%[m], #252]\n\t" + "ldr r9, [%[a], #252]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #252]\n\t" + /* a[i+64] += m[64] * mu */ + "ldr r10, [%[m], #256]\n\t" + "ldr r9, [%[a], #256]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #256]\n\t" + /* a[i+65] += m[65] * mu */ + "ldr r10, [%[m], #260]\n\t" + "ldr r9, [%[a], #260]\n\t" + "umaal r9, r3, r11, 
r10\n\t" + "str r9, [%[a], #260]\n\t" + /* a[i+66] += m[66] * mu */ + "ldr r10, [%[m], #264]\n\t" + "ldr r9, [%[a], #264]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #264]\n\t" + /* a[i+67] += m[67] * mu */ + "ldr r10, [%[m], #268]\n\t" + "ldr r9, [%[a], #268]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #268]\n\t" + /* a[i+68] += m[68] * mu */ + "ldr r10, [%[m], #272]\n\t" + "ldr r9, [%[a], #272]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #272]\n\t" + /* a[i+69] += m[69] * mu */ + "ldr r10, [%[m], #276]\n\t" + "ldr r9, [%[a], #276]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #276]\n\t" + /* a[i+70] += m[70] * mu */ + "ldr r10, [%[m], #280]\n\t" + "ldr r9, [%[a], #280]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #280]\n\t" + /* a[i+71] += m[71] * mu */ + "ldr r10, [%[m], #284]\n\t" + "ldr r9, [%[a], #284]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #284]\n\t" + /* a[i+72] += m[72] * mu */ + "ldr r10, [%[m], #288]\n\t" + "ldr r9, [%[a], #288]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #288]\n\t" + /* a[i+73] += m[73] * mu */ + "ldr r10, [%[m], #292]\n\t" + "ldr r9, [%[a], #292]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #292]\n\t" + /* a[i+74] += m[74] * mu */ + "ldr r10, [%[m], #296]\n\t" + "ldr r9, [%[a], #296]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #296]\n\t" + /* a[i+75] += m[75] * mu */ + "ldr r10, [%[m], #300]\n\t" + "ldr r9, [%[a], #300]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #300]\n\t" + /* a[i+76] += m[76] * mu */ + "ldr r10, [%[m], #304]\n\t" + "ldr r9, [%[a], #304]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #304]\n\t" + /* a[i+77] += m[77] * mu */ + "ldr r10, [%[m], #308]\n\t" + "ldr r9, [%[a], #308]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #308]\n\t" + /* a[i+78] += m[78] * mu */ + "ldr r10, [%[m], #312]\n\t" + "ldr r9, [%[a], #312]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #312]\n\t" + /* a[i+79] += m[79] * mu */ + "ldr r10, [%[m], #316]\n\t" + "ldr r9, [%[a], #316]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #316]\n\t" + /* a[i+80] += m[80] * mu */ + "ldr r10, [%[m], #320]\n\t" + "ldr r9, [%[a], #320]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #320]\n\t" + /* a[i+81] += m[81] * mu */ + "ldr r10, [%[m], #324]\n\t" + "ldr r9, [%[a], #324]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #324]\n\t" + /* a[i+82] += m[82] * mu */ + "ldr r10, [%[m], #328]\n\t" + "ldr r9, [%[a], #328]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #328]\n\t" + /* a[i+83] += m[83] * mu */ + "ldr r10, [%[m], #332]\n\t" + "ldr r9, [%[a], #332]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #332]\n\t" + /* a[i+84] += m[84] * mu */ + "ldr r10, [%[m], #336]\n\t" + "ldr r9, [%[a], #336]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #336]\n\t" + /* a[i+85] += m[85] * mu */ + "ldr r10, [%[m], #340]\n\t" + "ldr r9, [%[a], #340]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #340]\n\t" + /* a[i+86] += m[86] * mu */ + "ldr r10, [%[m], #344]\n\t" + "ldr r9, [%[a], #344]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #344]\n\t" + /* a[i+87] += m[87] * mu */ + "ldr r10, [%[m], #348]\n\t" + "ldr r9, [%[a], #348]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #348]\n\t" + /* a[i+88] += m[88] * mu */ + "ldr r10, [%[m], #352]\n\t" + "ldr r9, [%[a], #352]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #352]\n\t" + /* a[i+89] += m[89] * mu */ + "ldr r10, [%[m], #356]\n\t" + "ldr r9, [%[a], #356]\n\t" 
+ "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #356]\n\t" + /* a[i+90] += m[90] * mu */ + "ldr r10, [%[m], #360]\n\t" + "ldr r9, [%[a], #360]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #360]\n\t" + /* a[i+91] += m[91] * mu */ + "ldr r10, [%[m], #364]\n\t" + "ldr r9, [%[a], #364]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #364]\n\t" + /* a[i+92] += m[92] * mu */ + "ldr r10, [%[m], #368]\n\t" + "ldr r9, [%[a], #368]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #368]\n\t" + /* a[i+93] += m[93] * mu */ + "ldr r10, [%[m], #372]\n\t" + "ldr r9, [%[a], #372]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #372]\n\t" + /* a[i+94] += m[94] * mu */ + "ldr r10, [%[m], #376]\n\t" + "ldr r9, [%[a], #376]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #376]\n\t" + /* a[i+95] += m[95] * mu */ + "ldr r10, [%[m], #380]\n\t" + "ldr r9, [%[a], #380]\n\t" + "umaal r9, r3, r11, r10\n\t" + "ldr r11, [%[a], #384]\n\t" + "mov r10, #0\n\t" + "umaal r3, r11, r10, r10\n\t" + "str r9, [%[a], #380]\n\t" + "adds r3, r3, lr\n\t" + "adc lr, r11, #0\n\t" + "str r3, [%[a], #384]\n\t" + /* i += 1 */ + "add r12, r12, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r12, #0x180\n\t" + "blt L_sp_3072_mont_reduce_96_word_%=\n\t" + /* Loop Done */ + "str r4, [%[a]]\n\t" + "str r5, [%[a], #4]\n\t" + "str r6, [%[a], #8]\n\t" + "str r7, [%[a], #12]\n\t" + "str r8, [%[a], #16]\n\t" + "mov %[mp], lr\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -41458,9 +41701,9 @@ SP_NOINLINE static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, */ static sp_digit sp_3072_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r12, #0\n\t" @@ -41495,9 +41738,9 @@ static sp_digit sp_3072_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit sp_3072_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -41689,9 +41932,9 @@ static sp_digit sp_3072_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr r6, %[div], #16\n\t" @@ -41748,9 +41991,9 @@ static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - 
register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr lr, %[div], #1\n\t" @@ -41780,7 +42023,7 @@ static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "bpl L_div_3072_word_96_bit_%=\n\t" "add r3, r3, r3\n\t" "add r3, r3, #1\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -41808,7 +42051,7 @@ static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -41836,7 +42079,7 @@ static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -41987,8 +42230,8 @@ static void sp_3072_mask_96(sp_digit* r, const sp_digit* a, sp_digit m) */ static sp_int32 sp_3072_cmp_96(const sp_digit* a_p, const sp_digit* b_p) { - register const sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r2, #-1\n\t" @@ -41996,7 +42239,7 @@ static sp_int32 sp_3072_cmp_96(const sp_digit* a_p, const sp_digit* b_p) "mov r5, #0\n\t" "mov r3, #-1\n\t" #ifdef WOLFSSL_SP_SMALL -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r4, #0x1\n\t" "lsl r4, r4, #8\n\t" "add r4, r4, #0x7c\n\t" @@ -43616,10 +43859,10 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, */ static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -43656,10 +43899,10 @@ static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp */ static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r8, #0\n\t" @@ -44156,9 +44399,9 @@ int 
sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef HAVE_FFDHE_3072 static void sp_3072_lshift_96(sp_digit* r_p, const sp_digit* a_p, byte n_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register byte n asm ("r2") = n_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; __asm__ __volatile__ ( "rsb r12, %[n], #31\n\t" @@ -45042,14 +45285,14 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -45161,8 +45404,8 @@ static void sp_4096_to_bin_128(sp_digit* r, byte* a) */ static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -45405,12 +45648,11 @@ static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -45635,10 +45877,11 @@ static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_dig "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -45728,9 +45971,9 @@ SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) */ static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -45766,16 +46009,15 @@ static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_dig */ static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ 
__volatile__ ( - "mov r10, #0\n\t" "mov r12, #0\n\t" "add lr, %[a], #0x200\n\t" "\n" "L_sp_4096_sub_in_pkace_128_word_%=: \n\t" - "subs r12, r10, r12\n\t" + "rsbs r12, r12, #0\n\t" "ldm %[a], {r2, r3, r4, r5}\n\t" "ldm %[b]!, {r6, r7, r8, r9}\n\t" "sbcs r2, r2, r6\n\t" @@ -45783,13 +46025,13 @@ static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p) "sbcs r4, r4, r8\n\t" "sbcs r5, r5, r9\n\t" "stm %[a]!, {r2, r3, r4, r5}\n\t" - "sbc r12, r10, r10\n\t" + "sbc r12, r12, r12\n\t" "cmp %[a], lr\n\t" "bne L_sp_4096_sub_in_pkace_128_word_%=\n\t" "mov %[a], r12\n\t" : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -45804,9 +46046,9 @@ static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p) */ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x400\n\t" @@ -45824,7 +46066,7 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* "L_sp_4096_mul_128_inner_%=: \n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[b], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -45894,12 +46136,11 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* */ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x400\n\t" - "mov r12, #0\n\t" "mov r6, #0\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" @@ -45908,7 +46149,7 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "L_sp_4096_sqr_128_outer_%=: \n\t" "subs r3, r5, #0x1fc\n\t" "it cc\n\t" - "movcc r3, r12\n\t" + "movcc r3, #0\n\t" "sub r4, r5, r3\n\t" "\n" "L_sp_4096_sqr_128_inner_%=: \n\t" @@ -45916,7 +46157,7 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "beq L_sp_4096_sqr_128_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -45969,7 +46210,7 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "\n" "L_sp_4096_sqr_128_op_sqr_%=: \n\t" "ldr lr, [%[a], r3]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsr r10, lr, #16\n\t" "lsr r9, r9, #16\n\t" @@ -46023,7 +46264,7 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "bgt L_sp_4096_sqr_128_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -46057,15 +46298,14 @@ 
static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho) */ static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ "ldr r8, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r5, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -46098,7 +46338,7 @@ static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "L_sp_4096_mul_d_128_word_%=: \n\t" /* A[i] * B */ "ldr r8, [%[a], r9]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -46143,7 +46383,7 @@ static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "str r3, [%[r], #512]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -46156,15 +46396,14 @@ static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r3, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -46189,5051 +46428,11 @@ static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) #else "umull r3, r4, %[b], r8\n\t" #endif + "stm %[r]!, {r3}\n\t" "mov r5, #0\n\t" - "str r3, [%[r]], #4\n\t" /* A[1] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[2] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, 
r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[3] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[4] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[5] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - 
"adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[6] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[7] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[8] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[9] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" 
- "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[10] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[11] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[12] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[13] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul 
r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[14] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[15] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[16] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[17] * B */ - "ldr r8, [%[a]], #4\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[18] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[19] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[20] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - 
"adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[21] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[22] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[23] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[24] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs 
r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[25] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[26] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[27] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[28] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, 
r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[29] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[30] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[31] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, 
r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[32] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[33] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[34] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[35] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, 
r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[36] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[37] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[38] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[39] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, 
#0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[40] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[41] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[42] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[43] 
* B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[44] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[45] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[46] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, 
#16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[47] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[48] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[49] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[50] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" 
- "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[51] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[52] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[53] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[54] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - 
"lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[55] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[56] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[57] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, 
r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[58] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[59] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[60] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[61] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - 
"mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[62] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[63] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[64] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[65] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - 
"adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[66] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[67] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[68] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - 
"str r5, [%[r]], #4\n\t" - /* A[69] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[70] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[71] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[72] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - 
"mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[73] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[74] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[75] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[76] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, 
r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[77] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[78] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[79] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[80] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[81] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[82] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[83] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, 
r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[84] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[85] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[86] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[87] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, 
#0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[88] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[89] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[90] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[91] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, 
#16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[92] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[93] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[94] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, 
r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[95] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[96] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[97] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[98] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, 
r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[99] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[100] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[101] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[102] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, 
#0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[103] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[104] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[105] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[106] * B */ - "ldr r8, 
[%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[107] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[108] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[109] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl 
r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[110] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[111] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[112] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[113] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, 
r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[114] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[115] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[116] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[117] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, 
r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[118] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[119] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[120] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, 
%[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[121] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[122] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[123] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[124] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - 
"mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[125] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[126] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[127] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -51259,15 +46458,4045 @@ static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[2] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal 
r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[3] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[4] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" #endif - "str r4, [%[r]], #4\n\t" + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[5] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[6] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[7] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, 
%[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[8] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[9] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[10] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[11] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[12] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + 
"lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[13] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[14] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[15] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[16] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal 
r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[17] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[18] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[19] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[20] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[21] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr 
r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[22] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[23] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[24] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[25] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[26] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" 
+ "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[27] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[28] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[29] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[30] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" 
+#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[31] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[32] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[33] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[34] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[35] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + 
"mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[36] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[37] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[38] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[39] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[40] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, 
r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[41] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[42] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[43] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[44] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm 
%[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[45] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[46] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[47] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[48] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[49] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, 
r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[50] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[51] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[52] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[53] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[54] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds 
r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[55] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[56] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[57] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[58] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + 
"mov r3, #0\n\t" + /* A[59] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[60] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[61] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[62] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[63] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, 
r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[64] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[65] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[66] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[67] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[68] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc 
r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[69] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[70] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[71] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[72] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* 
A[73] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[74] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[75] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[76] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[77] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, 
#16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[78] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[79] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[80] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[81] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[82] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr 
r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[83] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[84] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[85] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[86] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[87] * B */ + "ldm 
%[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[88] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[89] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[90] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[91] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, 
r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[92] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[93] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[94] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[95] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[96] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + 
"mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[97] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[98] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[99] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[100] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[101] * B */ + "ldm %[a]!, {r8}\n\t" 
+#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[102] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[103] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[104] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[105] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" 
+ "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[106] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[107] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[108] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[109] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[110] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, 
r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[111] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[112] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[113] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[114] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[115] * B */ + "ldm %[a]!, {r8}\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[116] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[117] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[118] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[119] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + 
"mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[120] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[121] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[122] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[123] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[124] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, 
r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[125] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[126] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[127] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" "str r5, [%[r]]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" ); } @@ -51299,10 +50528,10 @@ static void sp_4096_mont_norm_128(sp_digit* r, const sp_digit* m) */ static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r6, #0\n\t" @@ 
-51339,10 +50568,10 @@ static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const s */ static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -51803,6 +51032,7 @@ static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const s } #endif /* WOLFSSL_SP_SMALL */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Reduce the number back to 4096 bits using Montgomery reduction. * * a A single precision number to reduce in place. @@ -51811,12 +51041,12 @@ static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const s */ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* m asm ("r1") = m_p; - register sp_digit mp asm ("r2") = mp_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "ldr r11, [%[m]]\n\t" #endif /* i = 0 */ @@ -51829,10 +51059,9 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m /* mu = a[i] * mp */ "mul r8, %[mp], r12\n\t" /* a[i+0] += m[0] * mu */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r11, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r7\n\t" @@ -51856,14 +51085,8 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r12, r12, r6\n\t" "adc r5, r5, r7\n\t" -#else - "umull r6, r7, r8, r11\n\t" - "adds r12, r12, r6\n\t" - "adc r5, r7, #0\n\t" -#endif /* a[i+1] += m[1] * mu */ "ldr r7, [%[m], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r10\n\t" @@ -51887,18 +51110,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r4, r4, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r4, r10, #0\n\t" -#endif "mov r12, lr\n\t" "adds r12, r12, r5\n\t" "adc r4, r4, #0\n\t" /* a[i+2] += m[2] * mu */ "ldr r7, [%[m], #8]\n\t" "ldr lr, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r10\n\t" @@ -51922,17 +51139,11 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r5, r5, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r5, r10, #0\n\t" -#endif "adds lr, lr, r4\n\t" "adc r5, r5, #0\n\t" /* a[i+3] += m[3] * mu */ "ldr r7, [%[m], #12]\n\t" "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -51956,18 +51167,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #12]\n\t" "adc r4, r4, #0\n\t" /* a[i+4] += m[4] * mu */ "ldr r7, [%[m], #16]\n\t" "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -51991,18 +51196,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #16]\n\t" "adc r5, r5, #0\n\t" /* a[i+5] += m[5] * mu */ "ldr r7, [%[m], #20]\n\t" "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52026,18 +51225,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #20]\n\t" "adc r4, r4, #0\n\t" /* a[i+6] += m[6] * mu */ "ldr r7, [%[m], #24]\n\t" "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52061,18 +51254,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #24]\n\t" "adc r5, r5, #0\n\t" /* a[i+7] += m[7] * mu */ "ldr r7, [%[m], #28]\n\t" "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52096,18 +51283,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #28]\n\t" "adc r4, r4, #0\n\t" /* a[i+8] += m[8] * mu */ "ldr r7, [%[m], #32]\n\t" "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52131,18 +51312,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #32]\n\t" "adc r5, r5, #0\n\t" /* a[i+9] += m[9] * mu */ "ldr r7, [%[m], #36]\n\t" "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52166,18 +51341,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, 
r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #36]\n\t" "adc r4, r4, #0\n\t" /* a[i+10] += m[10] * mu */ "ldr r7, [%[m], #40]\n\t" "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52201,18 +51370,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #40]\n\t" "adc r5, r5, #0\n\t" /* a[i+11] += m[11] * mu */ "ldr r7, [%[m], #44]\n\t" "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52236,18 +51399,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #44]\n\t" "adc r4, r4, #0\n\t" /* a[i+12] += m[12] * mu */ "ldr r7, [%[m], #48]\n\t" "ldr r10, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52271,18 +51428,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #48]\n\t" "adc r5, r5, #0\n\t" /* a[i+13] += m[13] * mu */ "ldr r7, [%[m], #52]\n\t" "ldr r10, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52306,18 +51457,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #52]\n\t" "adc r4, r4, #0\n\t" /* a[i+14] += m[14] * mu */ "ldr r7, [%[m], #56]\n\t" "ldr r10, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52341,18 +51486,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #56]\n\t" "adc r5, r5, #0\n\t" /* a[i+15] += m[15] * mu */ "ldr r7, [%[m], #60]\n\t" "ldr r10, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52376,18 +51515,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #60]\n\t" "adc r4, r4, #0\n\t" /* a[i+16] += m[16] * mu */ "ldr r7, [%[m], #64]\n\t" "ldr r10, [%[a], 
#64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52411,18 +51544,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #64]\n\t" "adc r5, r5, #0\n\t" /* a[i+17] += m[17] * mu */ "ldr r7, [%[m], #68]\n\t" "ldr r10, [%[a], #68]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52446,18 +51573,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #68]\n\t" "adc r4, r4, #0\n\t" /* a[i+18] += m[18] * mu */ "ldr r7, [%[m], #72]\n\t" "ldr r10, [%[a], #72]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52481,18 +51602,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #72]\n\t" "adc r5, r5, #0\n\t" /* a[i+19] += m[19] * mu */ "ldr r7, [%[m], #76]\n\t" "ldr r10, [%[a], #76]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52516,18 +51631,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #76]\n\t" "adc r4, r4, #0\n\t" /* a[i+20] += m[20] * mu */ "ldr r7, [%[m], #80]\n\t" "ldr r10, [%[a], #80]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52551,18 +51660,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #80]\n\t" "adc r5, r5, #0\n\t" /* a[i+21] += m[21] * mu */ "ldr r7, [%[m], #84]\n\t" "ldr r10, [%[a], #84]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52586,18 +51689,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #84]\n\t" "adc r4, r4, #0\n\t" /* a[i+22] += m[22] * mu */ "ldr r7, [%[m], #88]\n\t" "ldr r10, [%[a], #88]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52621,18 +51718,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m 
"lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #88]\n\t" "adc r5, r5, #0\n\t" /* a[i+23] += m[23] * mu */ "ldr r7, [%[m], #92]\n\t" "ldr r10, [%[a], #92]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52656,18 +51747,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #92]\n\t" "adc r4, r4, #0\n\t" /* a[i+24] += m[24] * mu */ "ldr r7, [%[m], #96]\n\t" "ldr r10, [%[a], #96]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52691,18 +51776,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #96]\n\t" "adc r5, r5, #0\n\t" /* a[i+25] += m[25] * mu */ "ldr r7, [%[m], #100]\n\t" "ldr r10, [%[a], #100]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52726,18 +51805,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #100]\n\t" "adc r4, r4, #0\n\t" /* a[i+26] += m[26] * mu */ "ldr r7, [%[m], #104]\n\t" "ldr r10, [%[a], #104]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52761,18 +51834,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #104]\n\t" "adc r5, r5, #0\n\t" /* a[i+27] += m[27] * mu */ "ldr r7, [%[m], #108]\n\t" "ldr r10, [%[a], #108]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52796,18 +51863,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #108]\n\t" "adc r4, r4, #0\n\t" /* a[i+28] += m[28] * mu */ "ldr r7, [%[m], #112]\n\t" "ldr r10, [%[a], #112]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52831,18 +51892,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #112]\n\t" "adc r5, r5, #0\n\t" /* 
a[i+29] += m[29] * mu */ "ldr r7, [%[m], #116]\n\t" "ldr r10, [%[a], #116]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52866,18 +51921,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #116]\n\t" "adc r4, r4, #0\n\t" /* a[i+30] += m[30] * mu */ "ldr r7, [%[m], #120]\n\t" "ldr r10, [%[a], #120]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52901,18 +51950,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #120]\n\t" "adc r5, r5, #0\n\t" /* a[i+31] += m[31] * mu */ "ldr r7, [%[m], #124]\n\t" "ldr r10, [%[a], #124]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -52936,18 +51979,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #124]\n\t" "adc r4, r4, #0\n\t" /* a[i+32] += m[32] * mu */ "ldr r7, [%[m], #128]\n\t" "ldr r10, [%[a], #128]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -52971,18 +52008,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #128]\n\t" "adc r5, r5, #0\n\t" /* a[i+33] += m[33] * mu */ "ldr r7, [%[m], #132]\n\t" "ldr r10, [%[a], #132]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53006,18 +52037,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #132]\n\t" "adc r4, r4, #0\n\t" /* a[i+34] += m[34] * mu */ "ldr r7, [%[m], #136]\n\t" "ldr r10, [%[a], #136]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53041,18 +52066,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #136]\n\t" "adc r5, r5, #0\n\t" /* a[i+35] += m[35] * mu */ "ldr r7, [%[m], #140]\n\t" "ldr r10, [%[a], #140]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53076,18 
+52095,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #140]\n\t" "adc r4, r4, #0\n\t" /* a[i+36] += m[36] * mu */ "ldr r7, [%[m], #144]\n\t" "ldr r10, [%[a], #144]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53111,18 +52124,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #144]\n\t" "adc r5, r5, #0\n\t" /* a[i+37] += m[37] * mu */ "ldr r7, [%[m], #148]\n\t" "ldr r10, [%[a], #148]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53146,18 +52153,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #148]\n\t" "adc r4, r4, #0\n\t" /* a[i+38] += m[38] * mu */ "ldr r7, [%[m], #152]\n\t" "ldr r10, [%[a], #152]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53181,18 +52182,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #152]\n\t" "adc r5, r5, #0\n\t" /* a[i+39] += m[39] * mu */ "ldr r7, [%[m], #156]\n\t" "ldr r10, [%[a], #156]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53216,18 +52211,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #156]\n\t" "adc r4, r4, #0\n\t" /* a[i+40] += m[40] * mu */ "ldr r7, [%[m], #160]\n\t" "ldr r10, [%[a], #160]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53251,18 +52240,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #160]\n\t" "adc r5, r5, #0\n\t" /* a[i+41] += m[41] * mu */ "ldr r7, [%[m], #164]\n\t" "ldr r10, [%[a], #164]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53286,18 +52269,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc 
r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #164]\n\t" "adc r4, r4, #0\n\t" /* a[i+42] += m[42] * mu */ "ldr r7, [%[m], #168]\n\t" "ldr r10, [%[a], #168]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53321,18 +52298,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #168]\n\t" "adc r5, r5, #0\n\t" /* a[i+43] += m[43] * mu */ "ldr r7, [%[m], #172]\n\t" "ldr r10, [%[a], #172]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53356,18 +52327,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #172]\n\t" "adc r4, r4, #0\n\t" /* a[i+44] += m[44] * mu */ "ldr r7, [%[m], #176]\n\t" "ldr r10, [%[a], #176]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53391,18 +52356,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #176]\n\t" "adc r5, r5, #0\n\t" /* a[i+45] += m[45] * mu */ "ldr r7, [%[m], #180]\n\t" "ldr r10, [%[a], #180]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53426,18 +52385,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #180]\n\t" "adc r4, r4, #0\n\t" /* a[i+46] += m[46] * mu */ "ldr r7, [%[m], #184]\n\t" "ldr r10, [%[a], #184]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53461,18 +52414,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #184]\n\t" "adc r5, r5, #0\n\t" /* a[i+47] += m[47] * mu */ "ldr r7, [%[m], #188]\n\t" "ldr r10, [%[a], #188]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53496,18 +52443,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #188]\n\t" "adc r4, r4, #0\n\t" /* a[i+48] += m[48] * mu */ "ldr r7, [%[m], #192]\n\t" "ldr r10, [%[a], #192]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53531,18 +52472,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #192]\n\t" "adc r5, r5, #0\n\t" /* a[i+49] += m[49] * mu */ "ldr r7, [%[m], #196]\n\t" "ldr r10, [%[a], #196]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53566,18 +52501,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #196]\n\t" "adc r4, r4, #0\n\t" /* a[i+50] += m[50] * mu */ "ldr r7, [%[m], #200]\n\t" "ldr r10, [%[a], #200]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53601,18 +52530,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #200]\n\t" "adc r5, r5, #0\n\t" /* a[i+51] += m[51] * mu */ "ldr r7, [%[m], #204]\n\t" "ldr r10, [%[a], #204]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53636,18 +52559,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #204]\n\t" "adc r4, r4, #0\n\t" /* a[i+52] += m[52] * mu */ "ldr r7, [%[m], #208]\n\t" "ldr r10, [%[a], #208]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53671,18 +52588,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #208]\n\t" "adc r5, r5, #0\n\t" /* a[i+53] += m[53] * mu */ "ldr r7, [%[m], #212]\n\t" "ldr r10, [%[a], #212]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53706,18 +52617,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #212]\n\t" "adc r4, r4, #0\n\t" /* a[i+54] += m[54] * mu */ "ldr r7, [%[m], #216]\n\t" "ldr r10, [%[a], #216]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53741,18 +52646,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds 
r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #216]\n\t" "adc r5, r5, #0\n\t" /* a[i+55] += m[55] * mu */ "ldr r7, [%[m], #220]\n\t" "ldr r10, [%[a], #220]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53776,18 +52675,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #220]\n\t" "adc r4, r4, #0\n\t" /* a[i+56] += m[56] * mu */ "ldr r7, [%[m], #224]\n\t" "ldr r10, [%[a], #224]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53811,18 +52704,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #224]\n\t" "adc r5, r5, #0\n\t" /* a[i+57] += m[57] * mu */ "ldr r7, [%[m], #228]\n\t" "ldr r10, [%[a], #228]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53846,18 +52733,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #228]\n\t" "adc r4, r4, #0\n\t" /* a[i+58] += m[58] * mu */ "ldr r7, [%[m], #232]\n\t" "ldr r10, [%[a], #232]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53881,18 +52762,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #232]\n\t" "adc r5, r5, #0\n\t" /* a[i+59] += m[59] * mu */ "ldr r7, [%[m], #236]\n\t" "ldr r10, [%[a], #236]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53916,18 +52791,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #236]\n\t" "adc r4, r4, #0\n\t" /* a[i+60] += m[60] * mu */ "ldr r7, [%[m], #240]\n\t" "ldr r10, [%[a], #240]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -53951,18 +52820,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #240]\n\t" "adc r5, r5, #0\n\t" /* a[i+61] += m[61] * mu 
*/ "ldr r7, [%[m], #244]\n\t" "ldr r10, [%[a], #244]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -53986,18 +52849,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #244]\n\t" "adc r4, r4, #0\n\t" /* a[i+62] += m[62] * mu */ "ldr r7, [%[m], #248]\n\t" "ldr r10, [%[a], #248]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54021,18 +52878,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #248]\n\t" "adc r5, r5, #0\n\t" /* a[i+63] += m[63] * mu */ "ldr r7, [%[m], #252]\n\t" "ldr r10, [%[a], #252]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54056,18 +52907,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #252]\n\t" "adc r4, r4, #0\n\t" /* a[i+64] += m[64] * mu */ "ldr r7, [%[m], #256]\n\t" "ldr r10, [%[a], #256]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54091,18 +52936,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #256]\n\t" "adc r5, r5, #0\n\t" /* a[i+65] += m[65] * mu */ "ldr r7, [%[m], #260]\n\t" "ldr r10, [%[a], #260]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54126,18 +52965,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #260]\n\t" "adc r4, r4, #0\n\t" /* a[i+66] += m[66] * mu */ "ldr r7, [%[m], #264]\n\t" "ldr r10, [%[a], #264]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54161,18 +52994,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #264]\n\t" "adc r5, r5, #0\n\t" /* a[i+67] += m[67] * mu */ "ldr r7, [%[m], #268]\n\t" "ldr r10, [%[a], #268]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54196,18 +53023,12 @@ static 
SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #268]\n\t" "adc r4, r4, #0\n\t" /* a[i+68] += m[68] * mu */ "ldr r7, [%[m], #272]\n\t" "ldr r10, [%[a], #272]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54231,18 +53052,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #272]\n\t" "adc r5, r5, #0\n\t" /* a[i+69] += m[69] * mu */ "ldr r7, [%[m], #276]\n\t" "ldr r10, [%[a], #276]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54266,18 +53081,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #276]\n\t" "adc r4, r4, #0\n\t" /* a[i+70] += m[70] * mu */ "ldr r7, [%[m], #280]\n\t" "ldr r10, [%[a], #280]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54301,18 +53110,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #280]\n\t" "adc r5, r5, #0\n\t" /* a[i+71] += m[71] * mu */ "ldr r7, [%[m], #284]\n\t" "ldr r10, [%[a], #284]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54336,18 +53139,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #284]\n\t" "adc r4, r4, #0\n\t" /* a[i+72] += m[72] * mu */ "ldr r7, [%[m], #288]\n\t" "ldr r10, [%[a], #288]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54371,18 +53168,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #288]\n\t" "adc r5, r5, #0\n\t" /* a[i+73] += m[73] * mu */ "ldr r7, [%[m], #292]\n\t" "ldr r10, [%[a], #292]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54406,18 +53197,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" 
-#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #292]\n\t" "adc r4, r4, #0\n\t" /* a[i+74] += m[74] * mu */ "ldr r7, [%[m], #296]\n\t" "ldr r10, [%[a], #296]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54441,18 +53226,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #296]\n\t" "adc r5, r5, #0\n\t" /* a[i+75] += m[75] * mu */ "ldr r7, [%[m], #300]\n\t" "ldr r10, [%[a], #300]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54476,18 +53255,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #300]\n\t" "adc r4, r4, #0\n\t" /* a[i+76] += m[76] * mu */ "ldr r7, [%[m], #304]\n\t" "ldr r10, [%[a], #304]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54511,18 +53284,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #304]\n\t" "adc r5, r5, #0\n\t" /* a[i+77] += m[77] * mu */ "ldr r7, [%[m], #308]\n\t" "ldr r10, [%[a], #308]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54546,18 +53313,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #308]\n\t" "adc r4, r4, #0\n\t" /* a[i+78] += m[78] * mu */ "ldr r7, [%[m], #312]\n\t" "ldr r10, [%[a], #312]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54581,18 +53342,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #312]\n\t" "adc r5, r5, #0\n\t" /* a[i+79] += m[79] * mu */ "ldr r7, [%[m], #316]\n\t" "ldr r10, [%[a], #316]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54616,18 +53371,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #316]\n\t" "adc r4, r4, #0\n\t" /* a[i+80] += m[80] * mu */ "ldr r7, [%[m], #320]\n\t" "ldr r10, [%[a], #320]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) 
"lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54651,18 +53400,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #320]\n\t" "adc r5, r5, #0\n\t" /* a[i+81] += m[81] * mu */ "ldr r7, [%[m], #324]\n\t" "ldr r10, [%[a], #324]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54686,18 +53429,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #324]\n\t" "adc r4, r4, #0\n\t" /* a[i+82] += m[82] * mu */ "ldr r7, [%[m], #328]\n\t" "ldr r10, [%[a], #328]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54721,18 +53458,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #328]\n\t" "adc r5, r5, #0\n\t" /* a[i+83] += m[83] * mu */ "ldr r7, [%[m], #332]\n\t" "ldr r10, [%[a], #332]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54756,18 +53487,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #332]\n\t" "adc r4, r4, #0\n\t" /* a[i+84] += m[84] * mu */ "ldr r7, [%[m], #336]\n\t" "ldr r10, [%[a], #336]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54791,18 +53516,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #336]\n\t" "adc r5, r5, #0\n\t" /* a[i+85] += m[85] * mu */ "ldr r7, [%[m], #340]\n\t" "ldr r10, [%[a], #340]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54826,18 +53545,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #340]\n\t" "adc r4, r4, #0\n\t" /* a[i+86] += m[86] * mu */ "ldr r7, [%[m], #344]\n\t" "ldr r10, [%[a], #344]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54861,18 +53574,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, 
r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #344]\n\t" "adc r5, r5, #0\n\t" /* a[i+87] += m[87] * mu */ "ldr r7, [%[m], #348]\n\t" "ldr r10, [%[a], #348]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54896,18 +53603,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #348]\n\t" "adc r4, r4, #0\n\t" /* a[i+88] += m[88] * mu */ "ldr r7, [%[m], #352]\n\t" "ldr r10, [%[a], #352]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -54931,18 +53632,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #352]\n\t" "adc r5, r5, #0\n\t" /* a[i+89] += m[89] * mu */ "ldr r7, [%[m], #356]\n\t" "ldr r10, [%[a], #356]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -54966,18 +53661,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #356]\n\t" "adc r4, r4, #0\n\t" /* a[i+90] += m[90] * mu */ "ldr r7, [%[m], #360]\n\t" "ldr r10, [%[a], #360]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55001,18 +53690,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #360]\n\t" "adc r5, r5, #0\n\t" /* a[i+91] += m[91] * mu */ "ldr r7, [%[m], #364]\n\t" "ldr r10, [%[a], #364]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55036,18 +53719,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #364]\n\t" "adc r4, r4, #0\n\t" /* a[i+92] += m[92] * mu */ "ldr r7, [%[m], #368]\n\t" "ldr r10, [%[a], #368]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55071,18 +53748,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #368]\n\t" "adc r5, r5, #0\n\t" /* a[i+93] += m[93] * mu */ "ldr r7, [%[m], 
#372]\n\t" "ldr r10, [%[a], #372]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55106,18 +53777,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #372]\n\t" "adc r4, r4, #0\n\t" /* a[i+94] += m[94] * mu */ "ldr r7, [%[m], #376]\n\t" "ldr r10, [%[a], #376]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55141,18 +53806,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #376]\n\t" "adc r5, r5, #0\n\t" /* a[i+95] += m[95] * mu */ "ldr r7, [%[m], #380]\n\t" "ldr r10, [%[a], #380]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55176,18 +53835,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #380]\n\t" "adc r4, r4, #0\n\t" /* a[i+96] += m[96] * mu */ "ldr r7, [%[m], #384]\n\t" "ldr r10, [%[a], #384]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55211,18 +53864,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #384]\n\t" "adc r5, r5, #0\n\t" /* a[i+97] += m[97] * mu */ "ldr r7, [%[m], #388]\n\t" "ldr r10, [%[a], #388]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55246,18 +53893,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #388]\n\t" "adc r4, r4, #0\n\t" /* a[i+98] += m[98] * mu */ "ldr r7, [%[m], #392]\n\t" "ldr r10, [%[a], #392]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55281,18 +53922,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #392]\n\t" "adc r5, r5, #0\n\t" /* a[i+99] += m[99] * mu */ "ldr r7, [%[m], #396]\n\t" "ldr r10, [%[a], #396]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55316,18 +53951,12 @@ static SP_NOINLINE void 
sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #396]\n\t" "adc r4, r4, #0\n\t" /* a[i+100] += m[100] * mu */ "ldr r7, [%[m], #400]\n\t" "ldr r10, [%[a], #400]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55351,18 +53980,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #400]\n\t" "adc r5, r5, #0\n\t" /* a[i+101] += m[101] * mu */ "ldr r7, [%[m], #404]\n\t" "ldr r10, [%[a], #404]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55386,18 +54009,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #404]\n\t" "adc r4, r4, #0\n\t" /* a[i+102] += m[102] * mu */ "ldr r7, [%[m], #408]\n\t" "ldr r10, [%[a], #408]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55421,18 +54038,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #408]\n\t" "adc r5, r5, #0\n\t" /* a[i+103] += m[103] * mu */ "ldr r7, [%[m], #412]\n\t" "ldr r10, [%[a], #412]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55456,18 +54067,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #412]\n\t" "adc r4, r4, #0\n\t" /* a[i+104] += m[104] * mu */ "ldr r7, [%[m], #416]\n\t" "ldr r10, [%[a], #416]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55491,18 +54096,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #416]\n\t" "adc r5, r5, #0\n\t" /* a[i+105] += m[105] * mu */ "ldr r7, [%[m], #420]\n\t" "ldr r10, [%[a], #420]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55526,18 +54125,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif 
"adds r10, r10, r5\n\t" "str r10, [%[a], #420]\n\t" "adc r4, r4, #0\n\t" /* a[i+106] += m[106] * mu */ "ldr r7, [%[m], #424]\n\t" "ldr r10, [%[a], #424]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55561,18 +54154,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #424]\n\t" "adc r5, r5, #0\n\t" /* a[i+107] += m[107] * mu */ "ldr r7, [%[m], #428]\n\t" "ldr r10, [%[a], #428]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55596,18 +54183,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #428]\n\t" "adc r4, r4, #0\n\t" /* a[i+108] += m[108] * mu */ "ldr r7, [%[m], #432]\n\t" "ldr r10, [%[a], #432]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55631,18 +54212,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #432]\n\t" "adc r5, r5, #0\n\t" /* a[i+109] += m[109] * mu */ "ldr r7, [%[m], #436]\n\t" "ldr r10, [%[a], #436]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55666,18 +54241,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #436]\n\t" "adc r4, r4, #0\n\t" /* a[i+110] += m[110] * mu */ "ldr r7, [%[m], #440]\n\t" "ldr r10, [%[a], #440]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55701,18 +54270,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #440]\n\t" "adc r5, r5, #0\n\t" /* a[i+111] += m[111] * mu */ "ldr r7, [%[m], #444]\n\t" "ldr r10, [%[a], #444]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55736,18 +54299,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #444]\n\t" "adc r4, r4, #0\n\t" /* a[i+112] += m[112] * mu */ "ldr r7, [%[m], #448]\n\t" "ldr r10, [%[a], #448]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 
4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55771,18 +54328,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #448]\n\t" "adc r5, r5, #0\n\t" /* a[i+113] += m[113] * mu */ "ldr r7, [%[m], #452]\n\t" "ldr r10, [%[a], #452]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55806,18 +54357,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #452]\n\t" "adc r4, r4, #0\n\t" /* a[i+114] += m[114] * mu */ "ldr r7, [%[m], #456]\n\t" "ldr r10, [%[a], #456]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55841,18 +54386,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #456]\n\t" "adc r5, r5, #0\n\t" /* a[i+115] += m[115] * mu */ "ldr r7, [%[m], #460]\n\t" "ldr r10, [%[a], #460]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55876,18 +54415,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #460]\n\t" "adc r4, r4, #0\n\t" /* a[i+116] += m[116] * mu */ "ldr r7, [%[m], #464]\n\t" "ldr r10, [%[a], #464]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55911,18 +54444,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #464]\n\t" "adc r5, r5, #0\n\t" /* a[i+117] += m[117] * mu */ "ldr r7, [%[m], #468]\n\t" "ldr r10, [%[a], #468]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -55946,18 +54473,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #468]\n\t" "adc r4, r4, #0\n\t" /* a[i+118] += m[118] * mu */ "ldr r7, [%[m], #472]\n\t" "ldr r10, [%[a], #472]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -55981,18 +54502,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, 
r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #472]\n\t" "adc r5, r5, #0\n\t" /* a[i+119] += m[119] * mu */ "ldr r7, [%[m], #476]\n\t" "ldr r10, [%[a], #476]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -56016,18 +54531,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #476]\n\t" "adc r4, r4, #0\n\t" /* a[i+120] += m[120] * mu */ "ldr r7, [%[m], #480]\n\t" "ldr r10, [%[a], #480]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -56051,18 +54560,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #480]\n\t" "adc r5, r5, #0\n\t" /* a[i+121] += m[121] * mu */ "ldr r7, [%[m], #484]\n\t" "ldr r10, [%[a], #484]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -56086,18 +54589,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #484]\n\t" "adc r4, r4, #0\n\t" /* a[i+122] += m[122] * mu */ "ldr r7, [%[m], #488]\n\t" "ldr r10, [%[a], #488]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -56121,18 +54618,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #488]\n\t" "adc r5, r5, #0\n\t" /* a[i+123] += m[123] * mu */ "ldr r7, [%[m], #492]\n\t" "ldr r10, [%[a], #492]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -56156,18 +54647,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #492]\n\t" "adc r4, r4, #0\n\t" /* a[i+124] += m[124] * mu */ "ldr r7, [%[m], #496]\n\t" "ldr r10, [%[a], #496]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -56191,18 +54676,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #496]\n\t" "adc r5, r5, #0\n\t" /* a[i+125] += m[125] 
* mu */ "ldr r7, [%[m], #500]\n\t" "ldr r10, [%[a], #500]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -56226,18 +54705,12 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #500]\n\t" "adc r4, r4, #0\n\t" /* a[i+126] += m[126] * mu */ "ldr r7, [%[m], #504]\n\t" "ldr r10, [%[a], #504]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -56261,22 +54734,16 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #504]\n\t" "adc r5, r5, #0\n\t" /* a[i+127] += m[127] * mu */ -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) - "ldr r7, [%[m], #508]\n\t" -#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m], #508]\n\t" +#else + "ldr r7, [%[m], #508]\n\t" #endif "ldr r10, [%[a], #508]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r11, #16\n\t" "lsr r6, r6, #16\n\t" @@ -56307,13 +54774,6 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "adds r5, r5, r6\n\t" "adcs r4, r4, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r7, r3\n\t" - "mov r3, #0\n\t" - "adc r3, r3, r3\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #508]\n\t" "ldr r10, [%[a], #512]\n\t" @@ -56325,6 +54785,7 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m "add %[a], %[a], #4\n\t" "cmp r9, #0x200\n\t" "blt L_sp_4096_mont_reduce_128_word_%=\n\t" + /* Loop Done */ "str r12, [%[a]]\n\t" "str lr, [%[a], #4]\n\t" "mov %[mp], r3\n\t" @@ -56335,6 +54796,1755 @@ static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Reduce the number back to 4096 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + "ldr r11, [%[m]]\n\t" + /* i = 0 */ + "mov r9, #0\n\t" + "mov r3, #0\n\t" + "ldr r12, [%[a]]\n\t" + "ldr lr, [%[a], #4]\n\t" + "\n" + "L_sp_4096_mont_reduce_128_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r8, %[mp], r12\n\t" + /* a[i+0] += m[0] * mu */ + "mov r5, #0\n\t" + "umlal r12, r5, r8, r11\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r7, [%[m], #4]\n\t" + "mov r4, #0\n\t" + "umlal lr, r4, r8, r7\n\t" + "mov r12, lr\n\t" + "adds r12, r12, r5\n\t" + "adc r4, r4, #0\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r7, [%[m], #8]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r5, #0\n\t" + "umlal lr, r5, r8, r7\n\t" + "adds lr, lr, r4\n\t" + "adc r5, r5, #0\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r7, [%[m], #12]\n\t" + "ldr r10, [%[a], #12]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #12]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r7, [%[m], #16]\n\t" + "ldr r10, [%[a], #16]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #16]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r7, [%[m], #20]\n\t" + "ldr r10, [%[a], #20]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #20]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r7, [%[m], #24]\n\t" + "ldr r10, [%[a], #24]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #24]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r7, [%[m], #28]\n\t" + "ldr r10, [%[a], #28]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #28]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r7, [%[m], #32]\n\t" + "ldr r10, [%[a], #32]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #32]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r7, [%[m], #36]\n\t" + "ldr r10, [%[a], #36]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #36]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r7, [%[m], #40]\n\t" + "ldr r10, [%[a], #40]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #40]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r7, [%[m], #44]\n\t" + "ldr r10, [%[a], #44]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #44]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r7, [%[m], #48]\n\t" + "ldr r10, [%[a], #48]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #48]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r7, [%[m], #52]\n\t" + "ldr r10, [%[a], #52]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #52]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r7, [%[m], #56]\n\t" + "ldr r10, [%[a], #56]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #56]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r7, 
[%[m], #60]\n\t" + "ldr r10, [%[a], #60]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #60]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r7, [%[m], #64]\n\t" + "ldr r10, [%[a], #64]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #64]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r7, [%[m], #68]\n\t" + "ldr r10, [%[a], #68]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #68]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r7, [%[m], #72]\n\t" + "ldr r10, [%[a], #72]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #72]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r7, [%[m], #76]\n\t" + "ldr r10, [%[a], #76]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #76]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r7, [%[m], #80]\n\t" + "ldr r10, [%[a], #80]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #80]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r7, [%[m], #84]\n\t" + "ldr r10, [%[a], #84]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #84]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r7, [%[m], #88]\n\t" + "ldr r10, [%[a], #88]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #88]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r7, [%[m], #92]\n\t" + "ldr r10, [%[a], #92]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #92]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r7, [%[m], #96]\n\t" + "ldr r10, [%[a], #96]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #96]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r7, [%[m], #100]\n\t" + "ldr r10, [%[a], #100]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #100]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r7, [%[m], #104]\n\t" + "ldr r10, [%[a], #104]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #104]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r7, [%[m], #108]\n\t" + "ldr r10, [%[a], #108]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #108]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r7, [%[m], #112]\n\t" + "ldr r10, [%[a], #112]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #112]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r7, [%[m], #116]\n\t" + "ldr r10, [%[a], #116]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #116]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r7, [%[m], #120]\n\t" + "ldr r10, [%[a], #120]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #120]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r7, [%[m], #124]\n\t" + "ldr r10, [%[a], #124]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, 
r10, r5\n\t" + "str r10, [%[a], #124]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+32] += m[32] * mu */ + "ldr r7, [%[m], #128]\n\t" + "ldr r10, [%[a], #128]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #128]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+33] += m[33] * mu */ + "ldr r7, [%[m], #132]\n\t" + "ldr r10, [%[a], #132]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #132]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+34] += m[34] * mu */ + "ldr r7, [%[m], #136]\n\t" + "ldr r10, [%[a], #136]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #136]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+35] += m[35] * mu */ + "ldr r7, [%[m], #140]\n\t" + "ldr r10, [%[a], #140]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #140]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+36] += m[36] * mu */ + "ldr r7, [%[m], #144]\n\t" + "ldr r10, [%[a], #144]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #144]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+37] += m[37] * mu */ + "ldr r7, [%[m], #148]\n\t" + "ldr r10, [%[a], #148]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #148]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+38] += m[38] * mu */ + "ldr r7, [%[m], #152]\n\t" + "ldr r10, [%[a], #152]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #152]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+39] += m[39] * mu */ + "ldr r7, [%[m], #156]\n\t" + "ldr r10, [%[a], #156]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #156]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+40] += m[40] * mu */ + "ldr r7, [%[m], #160]\n\t" + "ldr r10, [%[a], #160]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #160]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+41] += m[41] * mu */ + "ldr r7, [%[m], #164]\n\t" + "ldr r10, [%[a], #164]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #164]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+42] += m[42] * mu */ + "ldr r7, [%[m], #168]\n\t" + "ldr r10, [%[a], #168]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #168]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+43] += m[43] * mu */ + "ldr r7, [%[m], #172]\n\t" + "ldr r10, [%[a], #172]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #172]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+44] += m[44] * mu */ + "ldr r7, [%[m], #176]\n\t" + "ldr r10, [%[a], #176]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #176]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+45] += m[45] * mu */ + "ldr r7, [%[m], #180]\n\t" + "ldr r10, [%[a], #180]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #180]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+46] += m[46] * mu */ + "ldr r7, [%[m], #184]\n\t" + "ldr r10, [%[a], #184]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #184]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+47] += m[47] * mu */ + "ldr r7, [%[m], #188]\n\t" + "ldr r10, [%[a], #188]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #188]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+48] += 
m[48] * mu */ + "ldr r7, [%[m], #192]\n\t" + "ldr r10, [%[a], #192]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #192]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+49] += m[49] * mu */ + "ldr r7, [%[m], #196]\n\t" + "ldr r10, [%[a], #196]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #196]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+50] += m[50] * mu */ + "ldr r7, [%[m], #200]\n\t" + "ldr r10, [%[a], #200]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #200]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+51] += m[51] * mu */ + "ldr r7, [%[m], #204]\n\t" + "ldr r10, [%[a], #204]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #204]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+52] += m[52] * mu */ + "ldr r7, [%[m], #208]\n\t" + "ldr r10, [%[a], #208]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #208]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+53] += m[53] * mu */ + "ldr r7, [%[m], #212]\n\t" + "ldr r10, [%[a], #212]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #212]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+54] += m[54] * mu */ + "ldr r7, [%[m], #216]\n\t" + "ldr r10, [%[a], #216]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #216]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+55] += m[55] * mu */ + "ldr r7, [%[m], #220]\n\t" + "ldr r10, [%[a], #220]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #220]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+56] += m[56] * mu */ + "ldr r7, [%[m], #224]\n\t" + "ldr r10, [%[a], #224]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #224]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+57] += m[57] * mu */ + "ldr r7, [%[m], #228]\n\t" + "ldr r10, [%[a], #228]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #228]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+58] += m[58] * mu */ + "ldr r7, [%[m], #232]\n\t" + "ldr r10, [%[a], #232]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #232]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+59] += m[59] * mu */ + "ldr r7, [%[m], #236]\n\t" + "ldr r10, [%[a], #236]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #236]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+60] += m[60] * mu */ + "ldr r7, [%[m], #240]\n\t" + "ldr r10, [%[a], #240]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #240]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+61] += m[61] * mu */ + "ldr r7, [%[m], #244]\n\t" + "ldr r10, [%[a], #244]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #244]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+62] += m[62] * mu */ + "ldr r7, [%[m], #248]\n\t" + "ldr r10, [%[a], #248]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #248]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+63] += m[63] * mu */ + "ldr r7, [%[m], #252]\n\t" + "ldr r10, [%[a], #252]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #252]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+64] += m[64] * mu */ + "ldr r7, [%[m], #256]\n\t" + "ldr r10, [%[a], #256]\n\t" + "mov 
r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #256]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+65] += m[65] * mu */ + "ldr r7, [%[m], #260]\n\t" + "ldr r10, [%[a], #260]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #260]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+66] += m[66] * mu */ + "ldr r7, [%[m], #264]\n\t" + "ldr r10, [%[a], #264]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #264]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+67] += m[67] * mu */ + "ldr r7, [%[m], #268]\n\t" + "ldr r10, [%[a], #268]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #268]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+68] += m[68] * mu */ + "ldr r7, [%[m], #272]\n\t" + "ldr r10, [%[a], #272]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #272]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+69] += m[69] * mu */ + "ldr r7, [%[m], #276]\n\t" + "ldr r10, [%[a], #276]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #276]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+70] += m[70] * mu */ + "ldr r7, [%[m], #280]\n\t" + "ldr r10, [%[a], #280]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #280]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+71] += m[71] * mu */ + "ldr r7, [%[m], #284]\n\t" + "ldr r10, [%[a], #284]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #284]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+72] += m[72] * mu */ + "ldr r7, [%[m], #288]\n\t" + "ldr r10, [%[a], #288]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #288]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+73] += m[73] * mu */ + "ldr r7, [%[m], #292]\n\t" + "ldr r10, [%[a], #292]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #292]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+74] += m[74] * mu */ + "ldr r7, [%[m], #296]\n\t" + "ldr r10, [%[a], #296]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #296]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+75] += m[75] * mu */ + "ldr r7, [%[m], #300]\n\t" + "ldr r10, [%[a], #300]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #300]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+76] += m[76] * mu */ + "ldr r7, [%[m], #304]\n\t" + "ldr r10, [%[a], #304]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #304]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+77] += m[77] * mu */ + "ldr r7, [%[m], #308]\n\t" + "ldr r10, [%[a], #308]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #308]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+78] += m[78] * mu */ + "ldr r7, [%[m], #312]\n\t" + "ldr r10, [%[a], #312]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #312]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+79] += m[79] * mu */ + "ldr r7, [%[m], #316]\n\t" + "ldr r10, [%[a], #316]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #316]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+80] += m[80] * mu */ + "ldr r7, [%[m], #320]\n\t" + "ldr r10, [%[a], #320]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, 
[%[a], #320]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+81] += m[81] * mu */ + "ldr r7, [%[m], #324]\n\t" + "ldr r10, [%[a], #324]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #324]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+82] += m[82] * mu */ + "ldr r7, [%[m], #328]\n\t" + "ldr r10, [%[a], #328]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #328]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+83] += m[83] * mu */ + "ldr r7, [%[m], #332]\n\t" + "ldr r10, [%[a], #332]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #332]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+84] += m[84] * mu */ + "ldr r7, [%[m], #336]\n\t" + "ldr r10, [%[a], #336]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #336]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+85] += m[85] * mu */ + "ldr r7, [%[m], #340]\n\t" + "ldr r10, [%[a], #340]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #340]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+86] += m[86] * mu */ + "ldr r7, [%[m], #344]\n\t" + "ldr r10, [%[a], #344]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #344]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+87] += m[87] * mu */ + "ldr r7, [%[m], #348]\n\t" + "ldr r10, [%[a], #348]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #348]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+88] += m[88] * mu */ + "ldr r7, [%[m], #352]\n\t" + "ldr r10, [%[a], #352]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #352]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+89] += m[89] * mu */ + "ldr r7, [%[m], #356]\n\t" + "ldr r10, [%[a], #356]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #356]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+90] += m[90] * mu */ + "ldr r7, [%[m], #360]\n\t" + "ldr r10, [%[a], #360]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #360]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+91] += m[91] * mu */ + "ldr r7, [%[m], #364]\n\t" + "ldr r10, [%[a], #364]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #364]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+92] += m[92] * mu */ + "ldr r7, [%[m], #368]\n\t" + "ldr r10, [%[a], #368]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #368]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+93] += m[93] * mu */ + "ldr r7, [%[m], #372]\n\t" + "ldr r10, [%[a], #372]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #372]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+94] += m[94] * mu */ + "ldr r7, [%[m], #376]\n\t" + "ldr r10, [%[a], #376]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #376]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+95] += m[95] * mu */ + "ldr r7, [%[m], #380]\n\t" + "ldr r10, [%[a], #380]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #380]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+96] += m[96] * mu */ + "ldr r7, [%[m], #384]\n\t" + "ldr r10, [%[a], #384]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #384]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+97] += m[97] * mu */ + "ldr r7, 
[%[m], #388]\n\t" + "ldr r10, [%[a], #388]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #388]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+98] += m[98] * mu */ + "ldr r7, [%[m], #392]\n\t" + "ldr r10, [%[a], #392]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #392]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+99] += m[99] * mu */ + "ldr r7, [%[m], #396]\n\t" + "ldr r10, [%[a], #396]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #396]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+100] += m[100] * mu */ + "ldr r7, [%[m], #400]\n\t" + "ldr r10, [%[a], #400]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #400]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+101] += m[101] * mu */ + "ldr r7, [%[m], #404]\n\t" + "ldr r10, [%[a], #404]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #404]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+102] += m[102] * mu */ + "ldr r7, [%[m], #408]\n\t" + "ldr r10, [%[a], #408]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #408]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+103] += m[103] * mu */ + "ldr r7, [%[m], #412]\n\t" + "ldr r10, [%[a], #412]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #412]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+104] += m[104] * mu */ + "ldr r7, [%[m], #416]\n\t" + "ldr r10, [%[a], #416]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #416]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+105] += m[105] * mu */ + "ldr r7, [%[m], #420]\n\t" + "ldr r10, [%[a], #420]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #420]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+106] += m[106] * mu */ + "ldr r7, [%[m], #424]\n\t" + "ldr r10, [%[a], #424]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #424]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+107] += m[107] * mu */ + "ldr r7, [%[m], #428]\n\t" + "ldr r10, [%[a], #428]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #428]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+108] += m[108] * mu */ + "ldr r7, [%[m], #432]\n\t" + "ldr r10, [%[a], #432]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #432]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+109] += m[109] * mu */ + "ldr r7, [%[m], #436]\n\t" + "ldr r10, [%[a], #436]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #436]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+110] += m[110] * mu */ + "ldr r7, [%[m], #440]\n\t" + "ldr r10, [%[a], #440]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #440]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+111] += m[111] * mu */ + "ldr r7, [%[m], #444]\n\t" + "ldr r10, [%[a], #444]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #444]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+112] += m[112] * mu */ + "ldr r7, [%[m], #448]\n\t" + "ldr r10, [%[a], #448]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #448]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+113] += m[113] * mu */ + "ldr r7, [%[m], #452]\n\t" + "ldr r10, [%[a], #452]\n\t" + "mov 
r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #452]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+114] += m[114] * mu */ + "ldr r7, [%[m], #456]\n\t" + "ldr r10, [%[a], #456]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #456]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+115] += m[115] * mu */ + "ldr r7, [%[m], #460]\n\t" + "ldr r10, [%[a], #460]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #460]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+116] += m[116] * mu */ + "ldr r7, [%[m], #464]\n\t" + "ldr r10, [%[a], #464]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #464]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+117] += m[117] * mu */ + "ldr r7, [%[m], #468]\n\t" + "ldr r10, [%[a], #468]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #468]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+118] += m[118] * mu */ + "ldr r7, [%[m], #472]\n\t" + "ldr r10, [%[a], #472]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #472]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+119] += m[119] * mu */ + "ldr r7, [%[m], #476]\n\t" + "ldr r10, [%[a], #476]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #476]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+120] += m[120] * mu */ + "ldr r7, [%[m], #480]\n\t" + "ldr r10, [%[a], #480]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #480]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+121] += m[121] * mu */ + "ldr r7, [%[m], #484]\n\t" + "ldr r10, [%[a], #484]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #484]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+122] += m[122] * mu */ + "ldr r7, [%[m], #488]\n\t" + "ldr r10, [%[a], #488]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #488]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+123] += m[123] * mu */ + "ldr r7, [%[m], #492]\n\t" + "ldr r10, [%[a], #492]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #492]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+124] += m[124] * mu */ + "ldr r7, [%[m], #496]\n\t" + "ldr r10, [%[a], #496]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #496]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+125] += m[125] * mu */ + "ldr r7, [%[m], #500]\n\t" + "ldr r10, [%[a], #500]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #500]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+126] += m[126] * mu */ + "ldr r7, [%[m], #504]\n\t" + "ldr r10, [%[a], #504]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #504]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+127] += m[127] * mu */ + "ldr r7, [%[m], #508]\n\t" + "ldr r10, [%[a], #508]\n\t" + "umull r6, r7, r8, r7\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r7, r3\n\t" + "mov r3, #0\n\t" + "adc r3, r3, r3\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #508]\n\t" + "ldr r10, [%[a], #512]\n\t" + "adcs r10, r10, r4\n\t" + "str r10, [%[a], #512]\n\t" + "adc r3, r3, #0\n\t" + /* i += 1 */ + "add r9, r9, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r9, #0x200\n\t" + "blt L_sp_4096_mont_reduce_128_word_%=\n\t" + /* Loop Done */ + "str r12, [%[a]]\n\t" + "str lr, [%[a], #4]\n\t" + "mov 
%[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp); +} + +#else +/* Reduce the number back to 4096 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "\n" + "L_sp_4096_mont_reduce_128_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r11, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "ldr r10, [%[m]]\n\t" + "mov r3, #0\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r10, [%[m], #4]\n\t" + "mov r4, r5\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r10, [%[m], #8]\n\t" + "mov r5, r6\n\t" + "umaal r5, r3, r11, r10\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r10, [%[m], #12]\n\t" + "mov r6, r7\n\t" + "umaal r6, r3, r11, r10\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r10, [%[m], #16]\n\t" + "mov r7, r8\n\t" + "umaal r7, r3, r11, r10\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r10, [%[m], #20]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umaal r8, r3, r11, r10\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r10, [%[m], #24]\n\t" + "ldr r9, [%[a], #24]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r10, [%[m], #28]\n\t" + "ldr r9, [%[a], #28]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r10, [%[m], #32]\n\t" + "ldr r9, [%[a], #32]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r10, [%[m], #36]\n\t" + "ldr r9, [%[a], #36]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r10, [%[m], #40]\n\t" + "ldr r9, [%[a], #40]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r10, [%[m], #44]\n\t" + "ldr r9, [%[a], #44]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r10, [%[m], #48]\n\t" + "ldr r9, [%[a], #48]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r10, [%[m], #52]\n\t" + "ldr r9, [%[a], #52]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r10, [%[m], #56]\n\t" + "ldr r9, [%[a], #56]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r10, [%[m], #60]\n\t" + "ldr r9, [%[a], #60]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r10, [%[m], #64]\n\t" + "ldr r9, [%[a], #64]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r10, [%[m], #68]\n\t" + "ldr r9, [%[a], #68]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r10, [%[m], #72]\n\t" + "ldr r9, [%[a], #72]\n\t" + "umaal r9, r3, 
r11, r10\n\t" + "str r9, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r10, [%[m], #76]\n\t" + "ldr r9, [%[a], #76]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r10, [%[m], #80]\n\t" + "ldr r9, [%[a], #80]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r10, [%[m], #84]\n\t" + "ldr r9, [%[a], #84]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #84]\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r10, [%[m], #88]\n\t" + "ldr r9, [%[a], #88]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r10, [%[m], #92]\n\t" + "ldr r9, [%[a], #92]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r10, [%[m], #96]\n\t" + "ldr r9, [%[a], #96]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r10, [%[m], #100]\n\t" + "ldr r9, [%[a], #100]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r10, [%[m], #104]\n\t" + "ldr r9, [%[a], #104]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r10, [%[m], #108]\n\t" + "ldr r9, [%[a], #108]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r10, [%[m], #112]\n\t" + "ldr r9, [%[a], #112]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r10, [%[m], #116]\n\t" + "ldr r9, [%[a], #116]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r10, [%[m], #120]\n\t" + "ldr r9, [%[a], #120]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r10, [%[m], #124]\n\t" + "ldr r9, [%[a], #124]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #124]\n\t" + /* a[i+32] += m[32] * mu */ + "ldr r10, [%[m], #128]\n\t" + "ldr r9, [%[a], #128]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #128]\n\t" + /* a[i+33] += m[33] * mu */ + "ldr r10, [%[m], #132]\n\t" + "ldr r9, [%[a], #132]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #132]\n\t" + /* a[i+34] += m[34] * mu */ + "ldr r10, [%[m], #136]\n\t" + "ldr r9, [%[a], #136]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #136]\n\t" + /* a[i+35] += m[35] * mu */ + "ldr r10, [%[m], #140]\n\t" + "ldr r9, [%[a], #140]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #140]\n\t" + /* a[i+36] += m[36] * mu */ + "ldr r10, [%[m], #144]\n\t" + "ldr r9, [%[a], #144]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #144]\n\t" + /* a[i+37] += m[37] * mu */ + "ldr r10, [%[m], #148]\n\t" + "ldr r9, [%[a], #148]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #148]\n\t" + /* a[i+38] += m[38] * mu */ + "ldr r10, [%[m], #152]\n\t" + "ldr r9, [%[a], #152]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #152]\n\t" + /* a[i+39] += m[39] * mu */ + "ldr r10, [%[m], #156]\n\t" + "ldr r9, [%[a], #156]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #156]\n\t" + /* a[i+40] += m[40] * mu */ + "ldr r10, [%[m], #160]\n\t" + "ldr r9, [%[a], #160]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #160]\n\t" + /* a[i+41] += m[41] * mu */ + "ldr r10, [%[m], #164]\n\t" + "ldr r9, [%[a], #164]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #164]\n\t" + /* a[i+42] += m[42] * mu */ + "ldr r10, [%[m], #168]\n\t" + "ldr r9, [%[a], #168]\n\t" + "umaal r9, 
r3, r11, r10\n\t" + "str r9, [%[a], #168]\n\t" + /* a[i+43] += m[43] * mu */ + "ldr r10, [%[m], #172]\n\t" + "ldr r9, [%[a], #172]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #172]\n\t" + /* a[i+44] += m[44] * mu */ + "ldr r10, [%[m], #176]\n\t" + "ldr r9, [%[a], #176]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #176]\n\t" + /* a[i+45] += m[45] * mu */ + "ldr r10, [%[m], #180]\n\t" + "ldr r9, [%[a], #180]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #180]\n\t" + /* a[i+46] += m[46] * mu */ + "ldr r10, [%[m], #184]\n\t" + "ldr r9, [%[a], #184]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #184]\n\t" + /* a[i+47] += m[47] * mu */ + "ldr r10, [%[m], #188]\n\t" + "ldr r9, [%[a], #188]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #188]\n\t" + /* a[i+48] += m[48] * mu */ + "ldr r10, [%[m], #192]\n\t" + "ldr r9, [%[a], #192]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #192]\n\t" + /* a[i+49] += m[49] * mu */ + "ldr r10, [%[m], #196]\n\t" + "ldr r9, [%[a], #196]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #196]\n\t" + /* a[i+50] += m[50] * mu */ + "ldr r10, [%[m], #200]\n\t" + "ldr r9, [%[a], #200]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #200]\n\t" + /* a[i+51] += m[51] * mu */ + "ldr r10, [%[m], #204]\n\t" + "ldr r9, [%[a], #204]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #204]\n\t" + /* a[i+52] += m[52] * mu */ + "ldr r10, [%[m], #208]\n\t" + "ldr r9, [%[a], #208]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #208]\n\t" + /* a[i+53] += m[53] * mu */ + "ldr r10, [%[m], #212]\n\t" + "ldr r9, [%[a], #212]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #212]\n\t" + /* a[i+54] += m[54] * mu */ + "ldr r10, [%[m], #216]\n\t" + "ldr r9, [%[a], #216]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #216]\n\t" + /* a[i+55] += m[55] * mu */ + "ldr r10, [%[m], #220]\n\t" + "ldr r9, [%[a], #220]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #220]\n\t" + /* a[i+56] += m[56] * mu */ + "ldr r10, [%[m], #224]\n\t" + "ldr r9, [%[a], #224]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #224]\n\t" + /* a[i+57] += m[57] * mu */ + "ldr r10, [%[m], #228]\n\t" + "ldr r9, [%[a], #228]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #228]\n\t" + /* a[i+58] += m[58] * mu */ + "ldr r10, [%[m], #232]\n\t" + "ldr r9, [%[a], #232]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #232]\n\t" + /* a[i+59] += m[59] * mu */ + "ldr r10, [%[m], #236]\n\t" + "ldr r9, [%[a], #236]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #236]\n\t" + /* a[i+60] += m[60] * mu */ + "ldr r10, [%[m], #240]\n\t" + "ldr r9, [%[a], #240]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #240]\n\t" + /* a[i+61] += m[61] * mu */ + "ldr r10, [%[m], #244]\n\t" + "ldr r9, [%[a], #244]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #244]\n\t" + /* a[i+62] += m[62] * mu */ + "ldr r10, [%[m], #248]\n\t" + "ldr r9, [%[a], #248]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #248]\n\t" + /* a[i+63] += m[63] * mu */ + "ldr r10, [%[m], #252]\n\t" + "ldr r9, [%[a], #252]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #252]\n\t" + /* a[i+64] += m[64] * mu */ + "ldr r10, [%[m], #256]\n\t" + "ldr r9, [%[a], #256]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #256]\n\t" + /* a[i+65] += m[65] * mu */ + "ldr r10, [%[m], #260]\n\t" + "ldr r9, [%[a], #260]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #260]\n\t" + /* a[i+66] += m[66] * mu */ + "ldr r10, [%[m], #264]\n\t" + "ldr r9, [%[a], 
#264]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #264]\n\t" + /* a[i+67] += m[67] * mu */ + "ldr r10, [%[m], #268]\n\t" + "ldr r9, [%[a], #268]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #268]\n\t" + /* a[i+68] += m[68] * mu */ + "ldr r10, [%[m], #272]\n\t" + "ldr r9, [%[a], #272]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #272]\n\t" + /* a[i+69] += m[69] * mu */ + "ldr r10, [%[m], #276]\n\t" + "ldr r9, [%[a], #276]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #276]\n\t" + /* a[i+70] += m[70] * mu */ + "ldr r10, [%[m], #280]\n\t" + "ldr r9, [%[a], #280]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #280]\n\t" + /* a[i+71] += m[71] * mu */ + "ldr r10, [%[m], #284]\n\t" + "ldr r9, [%[a], #284]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #284]\n\t" + /* a[i+72] += m[72] * mu */ + "ldr r10, [%[m], #288]\n\t" + "ldr r9, [%[a], #288]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #288]\n\t" + /* a[i+73] += m[73] * mu */ + "ldr r10, [%[m], #292]\n\t" + "ldr r9, [%[a], #292]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #292]\n\t" + /* a[i+74] += m[74] * mu */ + "ldr r10, [%[m], #296]\n\t" + "ldr r9, [%[a], #296]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #296]\n\t" + /* a[i+75] += m[75] * mu */ + "ldr r10, [%[m], #300]\n\t" + "ldr r9, [%[a], #300]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #300]\n\t" + /* a[i+76] += m[76] * mu */ + "ldr r10, [%[m], #304]\n\t" + "ldr r9, [%[a], #304]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #304]\n\t" + /* a[i+77] += m[77] * mu */ + "ldr r10, [%[m], #308]\n\t" + "ldr r9, [%[a], #308]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #308]\n\t" + /* a[i+78] += m[78] * mu */ + "ldr r10, [%[m], #312]\n\t" + "ldr r9, [%[a], #312]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #312]\n\t" + /* a[i+79] += m[79] * mu */ + "ldr r10, [%[m], #316]\n\t" + "ldr r9, [%[a], #316]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #316]\n\t" + /* a[i+80] += m[80] * mu */ + "ldr r10, [%[m], #320]\n\t" + "ldr r9, [%[a], #320]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #320]\n\t" + /* a[i+81] += m[81] * mu */ + "ldr r10, [%[m], #324]\n\t" + "ldr r9, [%[a], #324]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #324]\n\t" + /* a[i+82] += m[82] * mu */ + "ldr r10, [%[m], #328]\n\t" + "ldr r9, [%[a], #328]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #328]\n\t" + /* a[i+83] += m[83] * mu */ + "ldr r10, [%[m], #332]\n\t" + "ldr r9, [%[a], #332]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #332]\n\t" + /* a[i+84] += m[84] * mu */ + "ldr r10, [%[m], #336]\n\t" + "ldr r9, [%[a], #336]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #336]\n\t" + /* a[i+85] += m[85] * mu */ + "ldr r10, [%[m], #340]\n\t" + "ldr r9, [%[a], #340]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #340]\n\t" + /* a[i+86] += m[86] * mu */ + "ldr r10, [%[m], #344]\n\t" + "ldr r9, [%[a], #344]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #344]\n\t" + /* a[i+87] += m[87] * mu */ + "ldr r10, [%[m], #348]\n\t" + "ldr r9, [%[a], #348]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #348]\n\t" + /* a[i+88] += m[88] * mu */ + "ldr r10, [%[m], #352]\n\t" + "ldr r9, [%[a], #352]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #352]\n\t" + /* a[i+89] += m[89] * mu */ + "ldr r10, [%[m], #356]\n\t" + "ldr r9, [%[a], #356]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #356]\n\t" + /* a[i+90] += m[90] * mu */ + "ldr r10, [%[m], 
#360]\n\t" + "ldr r9, [%[a], #360]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #360]\n\t" + /* a[i+91] += m[91] * mu */ + "ldr r10, [%[m], #364]\n\t" + "ldr r9, [%[a], #364]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #364]\n\t" + /* a[i+92] += m[92] * mu */ + "ldr r10, [%[m], #368]\n\t" + "ldr r9, [%[a], #368]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #368]\n\t" + /* a[i+93] += m[93] * mu */ + "ldr r10, [%[m], #372]\n\t" + "ldr r9, [%[a], #372]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #372]\n\t" + /* a[i+94] += m[94] * mu */ + "ldr r10, [%[m], #376]\n\t" + "ldr r9, [%[a], #376]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #376]\n\t" + /* a[i+95] += m[95] * mu */ + "ldr r10, [%[m], #380]\n\t" + "ldr r9, [%[a], #380]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #380]\n\t" + /* a[i+96] += m[96] * mu */ + "ldr r10, [%[m], #384]\n\t" + "ldr r9, [%[a], #384]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #384]\n\t" + /* a[i+97] += m[97] * mu */ + "ldr r10, [%[m], #388]\n\t" + "ldr r9, [%[a], #388]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #388]\n\t" + /* a[i+98] += m[98] * mu */ + "ldr r10, [%[m], #392]\n\t" + "ldr r9, [%[a], #392]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #392]\n\t" + /* a[i+99] += m[99] * mu */ + "ldr r10, [%[m], #396]\n\t" + "ldr r9, [%[a], #396]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #396]\n\t" + /* a[i+100] += m[100] * mu */ + "ldr r10, [%[m], #400]\n\t" + "ldr r9, [%[a], #400]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #400]\n\t" + /* a[i+101] += m[101] * mu */ + "ldr r10, [%[m], #404]\n\t" + "ldr r9, [%[a], #404]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #404]\n\t" + /* a[i+102] += m[102] * mu */ + "ldr r10, [%[m], #408]\n\t" + "ldr r9, [%[a], #408]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #408]\n\t" + /* a[i+103] += m[103] * mu */ + "ldr r10, [%[m], #412]\n\t" + "ldr r9, [%[a], #412]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #412]\n\t" + /* a[i+104] += m[104] * mu */ + "ldr r10, [%[m], #416]\n\t" + "ldr r9, [%[a], #416]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #416]\n\t" + /* a[i+105] += m[105] * mu */ + "ldr r10, [%[m], #420]\n\t" + "ldr r9, [%[a], #420]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #420]\n\t" + /* a[i+106] += m[106] * mu */ + "ldr r10, [%[m], #424]\n\t" + "ldr r9, [%[a], #424]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #424]\n\t" + /* a[i+107] += m[107] * mu */ + "ldr r10, [%[m], #428]\n\t" + "ldr r9, [%[a], #428]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #428]\n\t" + /* a[i+108] += m[108] * mu */ + "ldr r10, [%[m], #432]\n\t" + "ldr r9, [%[a], #432]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #432]\n\t" + /* a[i+109] += m[109] * mu */ + "ldr r10, [%[m], #436]\n\t" + "ldr r9, [%[a], #436]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #436]\n\t" + /* a[i+110] += m[110] * mu */ + "ldr r10, [%[m], #440]\n\t" + "ldr r9, [%[a], #440]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #440]\n\t" + /* a[i+111] += m[111] * mu */ + "ldr r10, [%[m], #444]\n\t" + "ldr r9, [%[a], #444]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #444]\n\t" + /* a[i+112] += m[112] * mu */ + "ldr r10, [%[m], #448]\n\t" + "ldr r9, [%[a], #448]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #448]\n\t" + /* a[i+113] += m[113] * mu */ + "ldr r10, [%[m], #452]\n\t" + "ldr r9, [%[a], #452]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #452]\n\t" 
+ /* a[i+114] += m[114] * mu */ + "ldr r10, [%[m], #456]\n\t" + "ldr r9, [%[a], #456]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #456]\n\t" + /* a[i+115] += m[115] * mu */ + "ldr r10, [%[m], #460]\n\t" + "ldr r9, [%[a], #460]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #460]\n\t" + /* a[i+116] += m[116] * mu */ + "ldr r10, [%[m], #464]\n\t" + "ldr r9, [%[a], #464]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #464]\n\t" + /* a[i+117] += m[117] * mu */ + "ldr r10, [%[m], #468]\n\t" + "ldr r9, [%[a], #468]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #468]\n\t" + /* a[i+118] += m[118] * mu */ + "ldr r10, [%[m], #472]\n\t" + "ldr r9, [%[a], #472]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #472]\n\t" + /* a[i+119] += m[119] * mu */ + "ldr r10, [%[m], #476]\n\t" + "ldr r9, [%[a], #476]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #476]\n\t" + /* a[i+120] += m[120] * mu */ + "ldr r10, [%[m], #480]\n\t" + "ldr r9, [%[a], #480]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #480]\n\t" + /* a[i+121] += m[121] * mu */ + "ldr r10, [%[m], #484]\n\t" + "ldr r9, [%[a], #484]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #484]\n\t" + /* a[i+122] += m[122] * mu */ + "ldr r10, [%[m], #488]\n\t" + "ldr r9, [%[a], #488]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #488]\n\t" + /* a[i+123] += m[123] * mu */ + "ldr r10, [%[m], #492]\n\t" + "ldr r9, [%[a], #492]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #492]\n\t" + /* a[i+124] += m[124] * mu */ + "ldr r10, [%[m], #496]\n\t" + "ldr r9, [%[a], #496]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #496]\n\t" + /* a[i+125] += m[125] * mu */ + "ldr r10, [%[m], #500]\n\t" + "ldr r9, [%[a], #500]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #500]\n\t" + /* a[i+126] += m[126] * mu */ + "ldr r10, [%[m], #504]\n\t" + "ldr r9, [%[a], #504]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #504]\n\t" + /* a[i+127] += m[127] * mu */ + "ldr r10, [%[m], #508]\n\t" + "ldr r9, [%[a], #508]\n\t" + "umaal r9, r3, r11, r10\n\t" + "ldr r11, [%[a], #512]\n\t" + "mov r10, #0\n\t" + "umaal r3, r11, r10, r10\n\t" + "str r9, [%[a], #508]\n\t" + "adds r3, r3, lr\n\t" + "adc lr, r11, #0\n\t" + "str r3, [%[a], #512]\n\t" + /* i += 1 */ + "add r12, r12, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r12, #0x200\n\t" + "blt L_sp_4096_mont_reduce_128_word_%=\n\t" + /* Loop Done */ + "str r4, [%[a]]\n\t" + "str r5, [%[a], #4]\n\t" + "str r6, [%[a], #8]\n\t" + "str r7, [%[a], #12]\n\t" + "str r8, [%[a], #16]\n\t" + "mov %[mp], lr\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). 
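For orientation, both sp_4096_mont_reduce_128 variants above (the UMLAL path and the UMAAL path) are fully unrolled forms of the standard word-by-word Montgomery reduction. A minimal C sketch of the same computation, assuming 32-bit digits and n = 128 words — the sketch and its helper name are illustrative only and are not part of the patch:

#include <stdint.h>

/* Illustrative sketch only: word-wise Montgomery reduction over n 32-bit
 * digits (n = 128 for 4096 bits).  a has at least 2*n digits and is
 * reduced in place; mp is -m^-1 mod 2^32.  Returns the final carry. */
static uint32_t mont_reduce_sketch(uint32_t* a, const uint32_t* m,
                                   uint32_t mp, int n)
{
    uint32_t carry = 0;
    for (int i = 0; i < n; i++) {
        uint32_t mu = a[i] * mp;            /* mu = a[i] * -m^-1 mod 2^32 */
        uint64_t t = 0;
        for (int j = 0; j < n; j++) {       /* a[i..i+n-1] += mu * m[0..n-1] */
            t += (uint64_t)mu * m[j] + a[i + j];
            a[i + j] = (uint32_t)t;
            t >>= 32;
        }
        t += (uint64_t)a[i + n] + carry;    /* fold the loop carry into a[i+n] */
        a[i + n] = (uint32_t)t;
        carry = (uint32_t)(t >> 32);
    }
    return carry;                           /* caller subtracts m once if set */
}

The final carry corresponds to the value the assembly hands back in mp, which sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp) then uses as a mask for the conditional final subtraction of the modulus.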
* (r = a * b mod m) * @@ -56374,9 +56584,9 @@ SP_NOINLINE static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, */ static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r12, #0\n\t" @@ -56411,9 +56621,9 @@ static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_dig */ static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -56661,9 +56871,9 @@ static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_dig */ static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr r6, %[div], #16\n\t" @@ -56720,9 +56930,9 @@ static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr lr, %[div], #1\n\t" @@ -56752,7 +56962,7 @@ static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "bpl L_div_4096_word_128_bit_%=\n\t" "add r3, r3, r3\n\t" "add r3, r3, #1\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -56780,7 +56990,7 @@ static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -56808,7 +57018,7 @@ static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -56959,8 +57169,8 @@ static void sp_4096_mask_128(sp_digit* r, const sp_digit* a, sp_digit m) */ static sp_int32 sp_4096_cmp_128(const sp_digit* a_p, const sp_digit* b_p) { - register const sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm 
("r1") = b_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r2, #-1\n\t" @@ -56968,7 +57178,7 @@ static sp_int32 sp_4096_cmp_128(const sp_digit* a_p, const sp_digit* b_p) "mov r5, #0\n\t" "mov r3, #-1\n\t" #ifdef WOLFSSL_SP_SMALL -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r4, #0x1\n\t" "lsl r4, r4, #8\n\t" "add r4, r4, #0xfc\n\t" @@ -58940,10 +59150,10 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, */ static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -58980,10 +59190,10 @@ static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp */ static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r8, #0\n\t" @@ -59536,9 +59746,9 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef HAVE_FFDHE_4096 static void sp_4096_lshift_128(sp_digit* r_p, const sp_digit* a_p, byte n_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register byte n asm ("r2") = n_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; __asm__ __volatile__ ( "rsb r12, %[n], #31\n\t" @@ -60607,9 +60817,9 @@ static const sp_digit p256_b[8] = { */ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" @@ -60627,7 +60837,7 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "L_sp_256_mul_8_inner_%=: \n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[b], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -60691,6 +60901,7 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p } #else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Multiply a and b into r. (r = a * b) * * r A single precision integer. 
@@ -60699,17 +60910,15 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p */ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #32\n\t" - "mov r10, #0\n\t" /* A[0] * B[0] */ "ldr r11, [%[a]]\n\t" "ldr r12, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r3, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -60732,14 +60941,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adc r4, r4, r7\n\t" "mov r5, #0\n\t" -#else - "umull r3, r4, r11, r12\n\t" - "mov r5, #0\n\t" -#endif "str r3, [sp]\n\t" /* A[0] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -60769,16 +60973,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[1] * B[0] */ "ldr r8, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -60807,16 +61003,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [sp, #4]\n\t" /* A[2] * B[0] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -60846,17 +61035,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[1] * B[1] */ "ldr r11, [%[a], #4]\n\t" "ldr r12, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -60885,16 +61066,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[0] * B[2] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -60923,16 +61097,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif "str r5, [sp, #8]\n\t" /* A[0] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 
4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -60962,16 +61129,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[1] * B[2] */ "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61000,15 +61159,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[2] * B[1] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61037,16 +61189,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[3] * B[0] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61075,16 +61220,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif "str r3, [sp, #12]\n\t" /* A[4] * B[0] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61114,16 +61252,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[3] * B[1] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61152,16 +61282,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[2] * B[2] */ "ldr r11, [%[a], #8]\n\t" "ldr r12, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61190,16 +61313,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[1] * B[3] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61228,16 +61344,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" 
"adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[0] * B[4] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61266,16 +61375,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [sp, #16]\n\t" /* A[0] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61305,17 +61407,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[1] * B[4] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61344,15 +61438,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[2] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61381,15 +61468,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[3] * B[2] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61418,16 +61498,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[4] * B[1] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61456,16 +61529,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[5] * B[0] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61494,16 +61560,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif "str r5, [sp, #20]\n\t" /* A[6] * B[0] */ 
"ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61533,17 +61592,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[5] * B[1] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61572,15 +61623,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[4] * B[2] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61609,16 +61653,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[3] * B[3] */ "ldr r11, [%[a], #12]\n\t" "ldr r12, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61647,16 +61684,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[2] * B[4] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61685,16 +61715,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[1] * B[5] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61723,16 +61746,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[0] * B[6] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61761,16 +61777,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif "str r3, [sp, #24]\n\t" /* A[0] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61800,17 
+61809,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[1] * B[6] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61839,16 +61840,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[2] * B[5] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61877,15 +61871,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[3] * B[4] */ "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61914,15 +61901,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[4] * B[3] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61951,16 +61931,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[5] * B[2] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -61989,16 +61962,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[6] * B[1] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62027,16 +61993,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[7] * B[0] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62065,16 +62024,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - 
"adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [sp, #28]\n\t" /* A[7] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62104,17 +62056,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[6] * B[2] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62143,15 +62087,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[5] * B[3] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62180,16 +62117,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[4] * B[4] */ "ldr r11, [%[a], #16]\n\t" "ldr r12, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62218,16 +62148,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[3] * B[5] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62256,16 +62179,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[2] * B[6] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62294,16 +62210,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[1] * B[7] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62332,16 +62241,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif "str r5, [%[r], #32]\n\t" /* A[2] * B[7] */ "ldr r8, [%[a], #8]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62371,17 +62273,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[3] * B[6] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62410,15 +62304,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[4] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62447,15 +62334,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[5] * B[4] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62484,16 +62364,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[6] * B[3] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62522,16 +62395,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[7] * B[2] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62560,16 +62426,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif "str r3, [%[r], #36]\n\t" /* A[7] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62599,16 +62458,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[6] * B[4] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62637,16 +62488,9 @@ static void sp_256_mul_8(sp_digit* r_p, const 
sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[5] * B[5] */ "ldr r11, [%[a], #20]\n\t" "ldr r12, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62675,16 +62519,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[4] * B[6] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62713,16 +62550,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[3] * B[7] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62751,16 +62581,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [%[r], #40]\n\t" /* A[4] * B[7] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62790,16 +62613,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[5] * B[6] */ "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62828,15 +62643,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[6] * B[5] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62865,16 +62673,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[7] * B[4] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62903,15 +62704,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r5, r5, r6\n\t" "adcs r3, r3, r7\n\t" "adc r4, r4, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, 
r7\n\t" - "adc r4, r4, #0\n\t" -#endif "str r5, [%[r], #44]\n\t" /* A[7] * B[5] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62941,17 +62735,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[6] * B[6] */ "ldr r11, [%[a], #24]\n\t" "ldr r12, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -62980,16 +62766,9 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r11, r12\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif /* A[5] * B[7] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -63018,15 +62797,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r3, r3, r6\n\t" "adcs r4, r4, r7\n\t" "adc r5, r5, #0\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#endif "str r3, [%[r], #48]\n\t" /* A[6] * B[7] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -63056,16 +62828,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r11, r9\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[7] * B[6] */ "ldr r8, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -63094,15 +62858,8 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r12\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [%[r], #52]\n\t" /* A[7] * B[7] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -63127,11 +62884,6 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "lsl r6, r6, #16\n\t" "adds r5, r5, r6\n\t" "adc r3, r3, r7\n\t" -#else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r7\n\t" -#endif "str r5, [%[r], #56]\n\t" "str r3, [%[r], #60]\n\t" "ldm sp!, {r3, r4, r5, r6}\n\t" @@ -63140,10 +62892,495 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "stm %[r]!, {r3, r4, r5, r6}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12" ); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "sub sp, sp, #36\n\t" + "str %[r], [sp, #32]\n\t" + "mov %[r], #0\n\t" + "ldr r12, [%[a]]\n\t" + /* A[0] * B[0] */ + "ldr lr, [%[b]]\n\t" + "umull r3, r4, r12, lr\n\t" + /* A[0] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "umull r5, r6, r12, lr\n\t" + /* A[0] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "umull r7, r8, r12, lr\n\t" + /* A[0] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "umull r9, r10, r12, lr\n\t" + "str r3, [sp]\n\t" + /* A[0] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "mov r11, %[r]\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[0] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adcs r6, r6, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[0] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adcs r8, r8, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[0] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adcs r10, r10, #0\n\t" + "adc r3, %[r], #0\n\t" + "umlal r10, r3, r12, lr\n\t" + /* A[1] * B[0] */ + "ldr r12, [%[a], #4]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "str r4, [sp, #4]\n\t" + "adds r5, r5, r11\n\t" + /* A[1] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[1] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[1] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[1] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[1] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[1] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[1] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r4, %[r], #0\n\t" + "umlal r3, r4, r12, lr\n\t" + /* A[2] * B[0] */ + "ldr r12, [%[a], #8]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "str r5, [sp, #8]\n\t" + "adds r6, r6, r11\n\t" + /* A[2] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[2] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[2] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[2] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[2] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[2] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[2] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r5, %[r], #0\n\t" + "umlal r4, r5, r12, lr\n\t" + /* A[3] * B[0] */ + "ldr r12, [%[a], #12]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "str r6, [sp, #12]\n\t" + "adds 
r7, r7, r11\n\t" + /* A[3] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[3] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[3] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[3] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[3] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[3] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[3] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r6, %[r], #0\n\t" + "umlal r5, r6, r12, lr\n\t" + /* A[4] * B[0] */ + "ldr r12, [%[a], #16]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "str r7, [sp, #16]\n\t" + "adds r8, r8, r11\n\t" + /* A[4] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[4] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[4] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[4] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[4] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[4] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[4] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r7, %[r], #0\n\t" + "umlal r6, r7, r12, lr\n\t" + /* A[5] * B[0] */ + "ldr r12, [%[a], #20]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "str r8, [sp, #20]\n\t" + "adds r9, r9, r11\n\t" + /* A[5] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[5] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[5] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[5] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[5] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[5] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[5] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r8, %[r], #0\n\t" + "umlal r7, r8, r12, lr\n\t" + /* A[6] * B[0] */ + "ldr r12, [%[a], #24]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "str r9, [sp, #24]\n\t" + "adds r10, r10, r11\n\t" + /* A[6] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[6] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[6] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, 
lr\n\t" + "adds r5, r5, r11\n\t" + /* A[6] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[6] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[6] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[6] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r9, %[r], #0\n\t" + "umlal r8, r9, r12, lr\n\t" + /* A[7] * B[0] */ + "ldr r12, [%[a], #28]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "str r10, [sp, #28]\n\t" + "adds r3, r3, r11\n\t" + /* A[7] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[7] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[7] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[7] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[7] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[7] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[7] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r10, %[r], #0\n\t" + "umlal r9, r10, r12, lr\n\t" + "ldr %[r], [sp, #32]\n\t" + "add %[r], %[r], #32\n\t" + "stm %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "sub %[r], %[r], #32\n\t" + "stm %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add sp, sp, #36\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#else +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. 
+ */ +static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "sub sp, sp, #44\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str %[r], [sp, #36]\n\t" + "str %[a], [sp, #40]\n\t" +#else + "strd %[r], %[a], [sp, #36]\n\t" +#endif + "mov lr, %[b]\n\t" + "ldm %[a], {%[r], %[a], %[b], r3}\n\t" + "ldm lr!, {r4, r5, r6}\n\t" + "umull r10, r11, %[r], r4\n\t" + "umull r12, r7, %[a], r4\n\t" + "umaal r11, r12, %[r], r5\n\t" + "umull r8, r9, %[b], r4\n\t" + "umaal r12, r8, %[a], r5\n\t" + "umaal r12, r7, %[r], r6\n\t" + "umaal r8, r9, r3, r4\n\t" + "stm sp, {r10, r11, r12}\n\t" + "umaal r7, r8, %[b], r5\n\t" + "ldm lr!, {r4}\n\t" + "umull r10, r11, %[a], r6\n\t" + "umaal r8, r9, %[b], r6\n\t" + "umaal r7, r10, %[r], r4\n\t" + "umaal r8, r11, r3, r5\n\t" + "str r7, [sp, #12]\n\t" + "umaal r8, r10, %[a], r4\n\t" + "umaal r9, r11, r3, r6\n\t" + "umaal r9, r10, %[b], r4\n\t" + "umaal r10, r11, r3, r4\n\t" + "ldm lr, {r4, r5, r6, r7}\n\t" + "mov r12, #0\n\t" + "umlal r8, r12, %[r], r4\n\t" + "umaal r9, r12, %[a], r4\n\t" + "umaal r10, r12, %[b], r4\n\t" + "umaal r11, r12, r3, r4\n\t" + "mov r4, #0\n\t" + "umlal r9, r4, %[r], r5\n\t" + "umaal r10, r4, %[a], r5\n\t" + "umaal r11, r4, %[b], r5\n\t" + "umaal r12, r4, r3, r5\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, %[r], r6\n\t" + "umaal r11, r5, %[a], r6\n\t" + "umaal r12, r5, %[b], r6\n\t" + "umaal r4, r5, r3, r6\n\t" + "mov r6, #0\n\t" + "umlal r11, r6, %[r], r7\n\t" + "ldr %[r], [sp, #40]\n\t" + "umaal r12, r6, %[a], r7\n\t" + "add %[r], %[r], #16\n\t" + "umaal r4, r6, %[b], r7\n\t" + "sub lr, lr, #16\n\t" + "umaal r5, r6, r3, r7\n\t" + "ldm %[r], {%[r], %[a], %[b], r3}\n\t" + "str r6, [sp, #32]\n\t" + "ldm lr!, {r6}\n\t" + "mov r7, #0\n\t" + "umlal r8, r7, %[r], r6\n\t" + "umaal r9, r7, %[a], r6\n\t" + "str r8, [sp, #16]\n\t" + "umaal r10, r7, %[b], r6\n\t" + "umaal r11, r7, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r8, #0\n\t" + "umlal r9, r8, %[r], r6\n\t" + "umaal r10, r8, %[a], r6\n\t" + "str r9, [sp, #20]\n\t" + "umaal r11, r8, %[b], r6\n\t" + "umaal r12, r8, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r9, #0\n\t" + "umlal r10, r9, %[r], r6\n\t" + "umaal r11, r9, %[a], r6\n\t" + "str r10, [sp, #24]\n\t" + "umaal r12, r9, %[b], r6\n\t" + "umaal r4, r9, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r10, #0\n\t" + "umlal r11, r10, %[r], r6\n\t" + "umaal r12, r10, %[a], r6\n\t" + "str r11, [sp, #28]\n\t" + "umaal r4, r10, %[b], r6\n\t" + "umaal r5, r10, r3, r6\n\t" + "ldm lr!, {r11}\n\t" + "umaal r12, r7, %[r], r11\n\t" + "umaal r4, r7, %[a], r11\n\t" + "ldr r6, [sp, #32]\n\t" + "umaal r5, r7, %[b], r11\n\t" + "umaal r6, r7, r3, r11\n\t" + "ldm lr!, {r11}\n\t" + "umaal r4, r8, %[r], r11\n\t" + "umaal r5, r8, %[a], r11\n\t" + "umaal r6, r8, %[b], r11\n\t" + "umaal r7, r8, r3, r11\n\t" + "ldm lr, {r11, lr}\n\t" + "umaal r5, r9, %[r], r11\n\t" + "umaal r6, r10, %[r], lr\n\t" + "umaal r6, r9, %[a], r11\n\t" + "umaal r7, r10, %[a], lr\n\t" + "umaal r7, r9, %[b], r11\n\t" + "umaal r8, r10, %[b], lr\n\t" + "umaal r8, r9, r3, r11\n\t" + "umaal r9, r10, r3, lr\n\t" + "mov r3, r12\n\t" + "ldr lr, [sp, #36]\n\t" + "add lr, lr, #32\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "sub lr, lr, #32\n\t" + "ldm sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add sp, sp, 
#44\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr" + ); +} + +#endif #endif /* WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Square a and put result in r. (r = a * a) @@ -63153,12 +63390,11 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p */ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" - "mov r12, #0\n\t" "mov r6, #0\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" @@ -63167,7 +63403,7 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "L_sp_256_sqr_8_outer_%=: \n\t" "subs r3, r5, #28\n\t" "it cc\n\t" - "movcc r3, r12\n\t" + "movcc r3, #0\n\t" "sub r4, r5, r3\n\t" "\n" "L_sp_256_sqr_8_inner_%=: \n\t" @@ -63175,7 +63411,7 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "beq L_sp_256_sqr_8_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -63228,7 +63464,7 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "\n" "L_sp_256_sqr_8_op_sqr_%=: \n\t" "ldr lr, [%[a], r3]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsr r10, lr, #16\n\t" "lsr r9, r9, #16\n\t" @@ -63282,11 +63518,12 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "bgt L_sp_256_sqr_8_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } #else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Square a and put result in r. (r = a * a) * * r A single precision integer. 
@@ -63294,14 +63531,13 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) */ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #32\n\t" /* A[0] * A[0] */ "ldr r10, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r9, r10, #16\n\t" "lsl r2, r10, #16\n\t" "lsr r2, r2, #16\n\t" @@ -63312,15 +63548,11 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r2, r2, #17\n\t" "adds r8, r8, r2\n\t" "adc r3, r3, r9\n\t" -#else - "umull r8, r3, r10, r10\n\t" -#endif "mov r4, #0\n\t" "str r8, [sp]\n\t" /* A[0] * A[1] */ "ldr r10, [%[a], #4]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63361,22 +63593,10 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" -#endif "str r3, [sp, #4]\n\t" /* A[0] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63417,20 +63637,8 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r4, r4, r8\n\t" "adcs r2, r2, r9\n\t" "adc r3, r3, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[1] * A[1] */ "ldr r10, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63450,17 +63658,10 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r4, r4, r8\n\t" "adcs r2, r2, r9\n\t" "adc r3, r3, #0\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [sp, #8]\n\t" /* A[0] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63501,21 +63702,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r4, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[1] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63555,20 +63744,10 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r4, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, #0\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, #0\n\t" 
-#endif "str r2, [sp, #12]\n\t" /* A[0] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63609,21 +63788,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" -#endif /* A[1] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63663,18 +63830,8 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" -#endif /* A[2] * A[2] */ "ldr r10, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63694,17 +63851,10 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" -#endif "str r3, [sp, #16]\n\t" /* A[0] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63726,15 +63876,11 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #16\n\t" "adds r5, r5, r8\n\t" "adc r6, r6, r9\n\t" -#else - "umull r5, r6, r10, r12\n\t" -#endif "mov r3, #0\n\t" "mov r7, #0\n\t" /* A[1] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63763,16 +63909,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[2] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63801,12 +63940,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif "adds r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adc r7, r7, r7\n\t" @@ -63817,7 +63950,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63839,15 +63971,11 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #16\n\t" "adds r5, r5, r8\n\t" "adc r6, r6, r9\n\t" -#else - "umull r5, r6, r10, r12\n\t" -#endif "mov r4, #0\n\t" "mov 
r7, #0\n\t" /* A[1] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63876,16 +64004,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[2] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63914,15 +64035,8 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[3] * A[3] */ "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63945,15 +64059,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adc r7, r7, r7\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adc r7, r7, r7\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, #0\n\t" -#endif "adds r2, r2, r5\n\t" "adcs r3, r3, r6\n\t" "adc r4, r4, r7\n\t" @@ -63961,7 +64066,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -63983,15 +64087,11 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #16\n\t" "adds r5, r5, r8\n\t" "adc r6, r6, r9\n\t" -#else - "umull r5, r6, r10, r12\n\t" -#endif "mov r2, #0\n\t" "mov r7, #0\n\t" /* A[1] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64020,16 +64120,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[2] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64058,16 +64151,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[3] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64096,12 +64182,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif "adds r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adc r7, r7, r7\n\t" @@ -64112,7 +64192,6 @@ static 
void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64134,15 +64213,11 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #16\n\t" "adds r5, r5, r8\n\t" "adc r6, r6, r9\n\t" -#else - "umull r5, r6, r10, r12\n\t" -#endif "mov r3, #0\n\t" "mov r7, #0\n\t" /* A[2] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64171,16 +64246,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[3] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64209,15 +64277,8 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[4] * A[4] */ "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64240,15 +64301,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adc r7, r7, r7\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adc r7, r7, r7\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, #0\n\t" -#endif "adds r4, r4, r5\n\t" "adcs r2, r2, r6\n\t" "adc r3, r3, r7\n\t" @@ -64256,7 +64308,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64278,15 +64329,11 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #16\n\t" "adds r5, r5, r8\n\t" "adc r6, r6, r9\n\t" -#else - "umull r5, r6, r10, r12\n\t" -#endif "mov r4, #0\n\t" "mov r7, #0\n\t" /* A[3] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64315,16 +64362,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif /* A[4] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64353,12 +64393,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r5, r5, r8\n\t" "adcs r6, r6, r9\n\t" "adc r7, r7, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r5, r5, r8\n\t" - "adcs r6, r6, r9\n\t" - "adc r7, r7, #0\n\t" -#endif "adds r5, r5, r5\n\t" "adcs r6, r6, r6\n\t" "adc r7, r7, r7\n\t" @@ -64369,7 
+64403,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64410,21 +64443,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" -#endif /* A[4] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64464,18 +64485,8 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" -#endif /* A[5] * A[5] */ "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64495,17 +64506,10 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r2, r2, #0\n\t" -#endif "str r3, [%[r], #40]\n\t" /* A[4] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64546,21 +64550,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r4, r4, r8\n\t" "adcs r2, r2, r9\n\t" "adc r3, r3, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif /* A[5] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64600,20 +64592,10 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r4, r4, r8\n\t" "adcs r2, r2, r9\n\t" "adc r3, r3, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, #0\n\t" - "adds r4, r4, r8\n\t" - "adcs r2, r2, r9\n\t" - "adc r3, r3, #0\n\t" -#endif "str r4, [%[r], #44]\n\t" /* A[5] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64654,20 +64636,8 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r4, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif /* A[6] * A[6] */ "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr 
r8, r8, #16\n\t" @@ -64687,17 +64657,10 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r2, r2, r8\n\t" "adcs r3, r3, r9\n\t" "adc r4, r4, #0\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adc r4, r4, #0\n\t" -#endif "str r2, [%[r], #48]\n\t" /* A[6] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64738,21 +64701,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adds r3, r3, r8\n\t" "adcs r4, r4, r9\n\t" "adc r2, r2, #0\n\t" -#else - "umull r8, r9, r10, r12\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "mov r2, #0\n\t" - "adc r2, r2, #0\n\t" -#endif "str r3, [%[r], #52]\n\t" /* A[7] * A[7] */ "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -64770,11 +64721,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "lsl r8, r8, #17\n\t" "adds r4, r4, r8\n\t" "adc r2, r2, r9\n\t" -#else - "umull r8, r9, r10, r10\n\t" - "adds r4, r4, r8\n\t" - "adc r2, r2, r9\n\t" -#endif "str r4, [%[r], #56]\n\t" "str r2, [%[r], #60]\n\t" "ldm sp!, {r2, r3, r4, r8}\n\t" @@ -64787,6 +64733,366 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) ); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x44\n\t" + "str %[r], [sp, #64]\n\t" + "mov %[r], #0\n\t" + "ldr r12, [%[a]]\n\t" + /* A[0] * A[1] */ + "ldr lr, [%[a], #4]\n\t" + "umull r4, r5, r12, lr\n\t" + /* A[0] * A[3] */ + "ldr lr, [%[a], #12]\n\t" + "umull r6, r7, r12, lr\n\t" + /* A[0] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "umull r8, r9, r12, lr\n\t" + /* A[0] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "umull r10, r3, r12, lr\n\t" + /* A[0] * A[2] */ + "ldr lr, [%[a], #8]\n\t" + "mov r11, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[0] * A[4] */ + "ldr lr, [%[a], #16]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[0] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adcs r9, r9, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + "adcs r3, r3, #0\n\t" + "str r4, [sp, #4]\n\t" + "str r5, [sp, #8]\n\t" + /* A[1] * A[2] */ + "ldr r12, [%[a], #4]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "str r6, [sp, #12]\n\t" + "adds r7, r7, r11\n\t" + /* A[1] * A[3] */ + "ldr lr, [%[a], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "str r7, [sp, #16]\n\t" + "adds r8, r8, r11\n\t" + /* A[1] * A[4] */ + "ldr lr, [%[a], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[1] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[1] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* 
A[1] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r4, %[r], #0\n\t" + "umlal r3, r4, r12, lr\n\t" + /* A[2] * A[3] */ + "ldr r12, [%[a], #8]\n\t" + "ldr lr, [%[a], #12]\n\t" + "mov r11, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "str r8, [sp, #20]\n\t" + "adds r9, r9, r11\n\t" + /* A[2] * A[4] */ + "ldr lr, [%[a], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "str r9, [sp, #24]\n\t" + "adds r10, r10, r11\n\t" + /* A[2] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[2] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[2] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r5, %[r], #0\n\t" + "umlal r4, r5, r12, lr\n\t" + /* A[3] * A[4] */ + "ldr r12, [%[a], #12]\n\t" + "ldr lr, [%[a], #16]\n\t" + "mov r11, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "str r10, [sp, #28]\n\t" + "adds r3, r3, r11\n\t" + /* A[3] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[3] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[3] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r6, %[r], #0\n\t" + "umlal r5, r6, r12, lr\n\t" + /* A[4] * A[5] */ + "ldr r12, [%[a], #16]\n\t" + "ldr lr, [%[a], #20]\n\t" + "mov r11, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[4] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[4] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r7, %[r], #0\n\t" + "umlal r6, r7, r12, lr\n\t" + /* A[5] * A[6] */ + "ldr r12, [%[a], #20]\n\t" + "ldr lr, [%[a], #24]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[5] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r8, %[r], #0\n\t" + "umlal r7, r8, r12, lr\n\t" + /* A[6] * A[7] */ + "ldr r12, [%[a], #24]\n\t" + "ldr lr, [%[a], #28]\n\t" + "mov r9, #0\n\t" + "umlal r8, r9, r12, lr\n\t" + "add lr, sp, #32\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "add lr, sp, #4\n\t" + "ldm lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "adds r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "stm lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "adcs r3, r3, r3\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, %[r], #0\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add lr, sp, #4\n\t" + "ldm lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "mov lr, sp\n\t" + /* A[0] * A[0] */ + "ldr r12, [%[a]]\n\t" + "umull r3, r11, r12, r12\n\t" + "adds r4, r4, r11\n\t" + /* A[1] * A[1] */ + "ldr r12, [%[a], #4]\n\t" + "adcs r5, r5, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, r12\n\t" + "adds r6, r6, r11\n\t" + /* A[2] * A[2] */ + "ldr r12, [%[a], #8]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, r12\n\t" + "adds r8, r8, r11\n\t" + /* A[3] * A[3] */ + "ldr r12, [%[a], #12]\n\t" + "adcs r9, r9, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, r12\n\t" + "adds r10, r10, r11\n\t" + "stm lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* A[4] * A[4] */ + "ldr r12, [%[a], 
#16]\n\t" + "adcs r3, r3, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, r12\n\t" + "adds r4, r4, r11\n\t" + /* A[5] * A[5] */ + "ldr r12, [%[a], #20]\n\t" + "adcs r5, r5, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, r12\n\t" + "adds r6, r6, r11\n\t" + /* A[6] * A[6] */ + "ldr r12, [%[a], #24]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, r12\n\t" + "adds r8, r8, r11\n\t" + /* A[7] * A[7] */ + "ldr r12, [%[a], #28]\n\t" + "adcs r9, r9, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r9, r10, r12, r12\n\t" + "ldr %[r], [sp, #64]\n\t" + "add %[r], %[r], #32\n\t" + "stm %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "sub %[r], %[r], #32\n\t" + "stm %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#else +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #32\n\t" + "str %[r], [sp, #28]\n\t" + "ldm %[a], {%[r], %[a], r2, r3, r4, r5, r6, r7}\n\t" + "umull r9, r10, %[r], %[r]\n\t" + "umull r11, r12, %[r], %[a]\n\t" + "adds r11, r11, r11\n\t" + "mov lr, #0\n\t" + "umaal r10, r11, lr, lr\n\t" + "stm sp, {r9, r10}\n\t" + "mov r8, lr\n\t" + "umaal r8, r12, %[r], r2\n\t" + "adcs r8, r8, r8\n\t" + "umaal r8, r11, %[a], %[a]\n\t" + "umull r9, r10, %[r], r3\n\t" + "umaal r9, r12, %[a], r2\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, lr, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [sp, #8]\n\t" + "str r9, [sp, #12]\n\t" +#else + "strd r8, r9, [sp, #8]\n\t" +#endif + "mov r9, lr\n\t" + "umaal r9, r10, %[r], r4\n\t" + "umaal r9, r12, %[a], r3\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, r2, r2\n\t" + "str r9, [sp, #16]\n\t" + "umull r9, r8, %[r], r5\n\t" + "umaal r9, r12, %[a], r4\n\t" + "umaal r9, r10, r2, r3\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, lr, lr\n\t" + "str r9, [sp, #20]\n\t" + "mov r9, lr\n\t" + "umaal r9, r8, %[r], r6\n\t" + "umaal r9, r12, %[a], r5\n\t" + "umaal r9, r10, r2, r4\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, r3, r3\n\t" + "str r9, [sp, #24]\n\t" + "umull %[r], r9, %[r], r7\n\t" + "umaal %[r], r8, %[a], r6\n\t" + "umaal %[r], r12, r2, r5\n\t" + "umaal %[r], r10, r3, r4\n\t" + "adcs %[r], %[r], %[r]\n\t" + "umaal %[r], r11, lr, lr\n\t" + /* R[7] = r0 */ + "umaal r9, r8, %[a], r7\n\t" + "umaal r9, r10, r2, r6\n\t" + "umaal r12, r9, r3, r5\n\t" + "adcs r12, r12, r12\n\t" + "umaal r12, r11, r4, r4\n\t" + /* R[8] = r12 */ + "umaal r9, r8, r2, r7\n\t" + "umaal r10, r9, r3, r6\n\t" + "mov r2, lr\n\t" + "umaal r10, r2, r4, r5\n\t" + "adcs r10, r10, r10\n\t" + "umaal r11, r10, lr, lr\n\t" + /* R[9] = r11 */ + "umaal r2, r8, r3, r7\n\t" + "umaal r2, r9, r4, r6\n\t" + "adcs r3, r2, r2\n\t" + "umaal r10, r3, r5, r5\n\t" + /* R[10] = r10 */ + "mov %[a], lr\n\t" + "umaal %[a], r8, r4, r7\n\t" + "umaal %[a], r9, r5, r6\n\t" + "adcs r4, %[a], %[a]\n\t" + "umaal r3, r4, lr, lr\n\t" + /* R[11] = r3 */ + "umaal r8, r9, r5, r7\n\t" + "adcs r8, r8, r8\n\t" + "umaal r4, r8, r6, r6\n\t" + /* R[12] = r4 */ + "mov r5, lr\n\t" + "umaal r5, r9, r6, r7\n\t" + "adcs r5, r5, r5\n\t" + "umaal r8, r5, lr, lr\n\t" + /* R[13] = r8 */ + 
"adcs r9, r9, r9\n\t" + "umaal r9, r5, r7, r7\n\t" + "adcs r7, r5, lr\n\t" + /* R[14] = r9 */ + /* R[15] = r7 */ + "ldr lr, [sp, #28]\n\t" + "add lr, lr, #28\n\t" + "stm lr!, {%[r], r12}\n\t" + "stm lr!, {r11}\n\t" + "stm lr!, {r10}\n\t" + "stm lr!, {r3, r4, r8, r9}\n\t" + "stm lr!, {r7}\n\t" + "sub lr, lr, #0x40\n\t" + "ldm sp, {%[r], %[a], r2, r3, r4, r5, r6}\n\t" + "stm lr, {%[r], %[a], r2, r3, r4, r5, r6}\n\t" + "add sp, sp, #32\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#endif #endif /* WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Add b to a into r. (r = a + b) @@ -64797,9 +65103,9 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) */ static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -64835,12 +65141,11 @@ static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* */ static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -64855,10 +65160,11 @@ static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -64873,9 +65179,9 @@ static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* */ static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r12, #0\n\t" @@ -64910,9 +65216,9 @@ static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* */ static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -64946,8 +65252,8 @@ static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* */ static 
int sp_256_mod_mul_norm_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #24\n\t" @@ -65184,14 +65490,14 @@ static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -65369,6 +65675,7 @@ static int sp_256_point_to_ecc_point_8(const sp_point_256* p, ecc_point* pm) return err; } +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -65380,9 +65687,9 @@ static int sp_256_point_to_ecc_point_8(const sp_point_256* p, ecc_point* pm) */ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x44\n\t" @@ -65390,7 +65697,6 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co /* A[0] * B[0] */ "ldr r6, [%[a]]\n\t" "ldr r7, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r8, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65412,13 +65718,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r3, r3, #16\n\t" "adds r8, r8, r3\n\t" "adc r9, r9, r4\n\t" -#else - "umull r8, r9, r6, r7\n\t" -#endif "str r8, [sp]\n\t" /* A[0] * B[1] */ "ldr r7, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65444,15 +65746,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r3, r3, #16\n\t" "adds r9, r9, r3\n\t" "adc r10, r10, r4\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adc r10, r4, #0\n\t" -#endif /* A[1] * B[0] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65481,16 +65777,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, #0\n\t" -#endif "str r9, [sp, #4]\n\t" /* A[2] * B[0] */ "ldr r6, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65515,15 +65804,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r3, r3, #16\n\t" "adds r10, r10, 
r3\n\t" "adc lr, lr, r4\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adc lr, r4, lr\n\t" -#endif /* A[1] * B[1] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65552,16 +65835,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, #0\n\t" -#endif /* A[0] * B[2] */ "ldr r6, [%[a]]\n\t" "ldr r7, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65590,16 +65866,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif "str r10, [sp, #8]\n\t" /* A[0] * B[3] */ "ldr r7, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65628,16 +65897,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, #0\n\t" -#endif /* A[1] * B[2] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65666,16 +65928,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[2] * B[1] */ "ldr r6, [%[a], #8]\n\t" "ldr r7, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65704,16 +65959,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[3] * B[0] */ "ldr r6, [%[a], #12]\n\t" "ldr r7, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65742,16 +65990,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif "str lr, [sp, #12]\n\t" /* A[4] * B[0] */ "ldr r6, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65780,16 +66021,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" -#endif /* A[3] * B[1] */ "ldr r6, [%[a], #12]\n\t" 
"ldr r7, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65818,16 +66052,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[2] * B[2] */ "ldr r6, [%[a], #8]\n\t" "ldr r7, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65856,16 +66083,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[1] * B[3] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65894,16 +66114,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[0] * B[4] */ "ldr r6, [%[a]]\n\t" "ldr r7, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65932,16 +66145,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif "str r8, [sp, #16]\n\t" /* A[0] * B[5] */ "ldr r7, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -65970,16 +66176,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, #0\n\t" -#endif /* A[1] * B[4] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66008,16 +66207,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif /* A[2] * B[3] */ "ldr r6, [%[a], #8]\n\t" "ldr r7, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66046,16 +66238,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif /* A[3] * B[2] */ "ldr r6, [%[a], #12]\n\t" "ldr r7, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, 
#16\n\t" @@ -66084,16 +66269,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif /* A[4] * B[1] */ "ldr r6, [%[a], #16]\n\t" "ldr r7, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66122,16 +66300,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif /* A[5] * B[0] */ "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66160,16 +66331,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif "str r9, [sp, #20]\n\t" /* A[6] * B[0] */ "ldr r6, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66198,16 +66362,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, #0\n\t" -#endif /* A[5] * B[1] */ "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66236,16 +66393,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif /* A[4] * B[2] */ "ldr r6, [%[a], #16]\n\t" "ldr r7, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66274,16 +66424,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif /* A[3] * B[3] */ "ldr r6, [%[a], #12]\n\t" "ldr r7, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66312,16 +66455,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif /* A[2] * B[4] */ "ldr r6, [%[a], #8]\n\t" "ldr r7, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66350,16 +66486,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" 
"adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif /* A[1] * B[5] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66388,16 +66517,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif /* A[0] * B[6] */ "ldr r6, [%[a]]\n\t" "ldr r7, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66426,16 +66548,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif "str r10, [sp, #24]\n\t" /* A[0] * B[7] */ "ldr r7, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66464,16 +66579,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, #0\n\t" -#endif /* A[1] * B[6] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66502,16 +66610,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[2] * B[5] */ "ldr r6, [%[a], #8]\n\t" "ldr r7, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66540,16 +66641,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[3] * B[4] */ "ldr r6, [%[a], #12]\n\t" "ldr r7, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66578,16 +66672,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[4] * B[3] */ "ldr r6, [%[a], #16]\n\t" "ldr r7, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66616,16 +66703,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif 
/* A[5] * B[2] */ "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66654,16 +66734,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[6] * B[1] */ "ldr r6, [%[a], #24]\n\t" "ldr r7, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66692,16 +66765,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[7] * B[0] */ "ldr r6, [%[a], #28]\n\t" "ldr r7, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66730,16 +66796,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif "str lr, [sp, #28]\n\t" /* A[7] * B[1] */ "ldr r7, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66768,16 +66827,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" -#endif /* A[6] * B[2] */ "ldr r6, [%[a], #24]\n\t" "ldr r7, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66806,16 +66858,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[5] * B[3] */ "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66844,16 +66889,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[4] * B[4] */ "ldr r6, [%[a], #16]\n\t" "ldr r7, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66882,16 +66920,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[3] * B[5] */ "ldr r6, [%[a], #12]\n\t" "ldr r7, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" 
"lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66920,16 +66951,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[2] * B[6] */ "ldr r6, [%[a], #8]\n\t" "ldr r7, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66958,16 +66982,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[1] * B[7] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -66996,16 +67013,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif "str r8, [sp, #32]\n\t" /* A[2] * B[7] */ "ldr r6, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67034,16 +67044,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, #0\n\t" -#endif /* A[3] * B[6] */ "ldr r6, [%[a], #12]\n\t" "ldr r7, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67072,16 +67075,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif /* A[4] * B[5] */ "ldr r6, [%[a], #16]\n\t" "ldr r7, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67110,16 +67106,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif /* A[5] * B[4] */ "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67148,16 +67137,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif /* A[6] * B[3] */ "ldr r6, [%[a], #24]\n\t" "ldr r7, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67186,16 +67168,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const 
sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif /* A[7] * B[2] */ "ldr r6, [%[a], #28]\n\t" "ldr r7, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67224,16 +67199,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif "str r9, [sp, #36]\n\t" /* A[7] * B[3] */ "ldr r7, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67262,16 +67230,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, #0\n\t" -#endif /* A[6] * B[4] */ "ldr r6, [%[a], #24]\n\t" "ldr r7, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67300,16 +67261,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif /* A[5] * B[5] */ "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67338,16 +67292,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif /* A[4] * B[6] */ "ldr r6, [%[a], #16]\n\t" "ldr r7, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67376,16 +67323,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif /* A[3] * B[7] */ "ldr r6, [%[a], #12]\n\t" "ldr r7, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67414,16 +67354,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif "str r10, [sp, #40]\n\t" /* A[4] * B[7] */ "ldr r6, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67452,16 +67385,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, 
r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, #0\n\t" -#endif /* A[5] * B[6] */ "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67490,16 +67416,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[6] * B[5] */ "ldr r6, [%[a], #24]\n\t" "ldr r7, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67528,16 +67447,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[7] * B[4] */ "ldr r6, [%[a], #28]\n\t" "ldr r7, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67566,16 +67478,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif "str lr, [sp, #44]\n\t" /* A[7] * B[5] */ "ldr r7, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67604,16 +67509,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" -#endif /* A[6] * B[6] */ "ldr r6, [%[a], #24]\n\t" "ldr r7, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67642,16 +67540,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[5] * B[7] */ "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67680,15 +67571,8 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[6] * B[7] */ "ldr r6, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67717,16 +67601,9 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, #0\n\t" -#endif /* A[7] * B[6] */ "ldr r6, [%[a], #28]\n\t" "ldr r7, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67755,15 +67632,8 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif /* A[7] * B[7] */ "ldr r7, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67788,175 +67658,880 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r3, r3, #16\n\t" "adds r10, r10, r3\n\t" "adc lr, lr, r4\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adc lr, r4, lr\n\t" -#endif "str r8, [sp, #48]\n\t" "str r9, [sp, #52]\n\t" "str r10, [sp, #56]\n\t" "str lr, [sp, #60]\n\t" + "str %[r], [sp, #64]\n\t" /* Start Reduction */ - "ldr r4, [sp]\n\t" - "ldr r5, [sp, #4]\n\t" - "ldr r6, [sp, #8]\n\t" - "ldr r7, [sp, #12]\n\t" - "ldr r8, [sp, #16]\n\t" - "ldr r9, [sp, #20]\n\t" - "ldr r10, [sp, #24]\n\t" - "ldr lr, [sp, #28]\n\t" - /* mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192 */ + "ldm sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "str %[r], [sp]\n\t" + "mov r3, r11\n\t" + "mov r4, r12\n\t" + /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ /* - a[0] << 224 */ - /* + (a[0..1] * 2) << (6 * 32) */ - "adds r10, r10, r4\n\t" - "adc lr, lr, r5\n\t" - "adds r10, r10, r4\n\t" - "adc lr, lr, r5\n\t" + /* + (a[0]-a[1] * 2) << (6 * 32) */ + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" /* - a[0] << (7 * 32) */ - "sub lr, lr, r4\n\t" - /* + a[0..4] << (3 * 32) */ - "mov %[a], r7\n\t" - "mov r12, r8\n\t" - "adds r7, r7, r4\n\t" - "adcs r8, r8, r5\n\t" + "sub r12, r12, r5\n\t" + /* + a[0]-a[4] << (3 * 32) */ + "mov r0, r8\n\t" + "mov r1, r9\n\t" + "mov r2, r10\n\t" + "adds r8, r8, r5\n\t" "adcs r9, r9, r6\n\t" - "adcs r10, r10, %[a]\n\t" - "adc lr, lr, r12\n\t" - "str r7, [sp, #12]\n\t" - "str r8, [sp, #16]\n\t" - "str r9, [sp, #20]\n\t" + "adcs r10, r10, r7\n\t" + "adcs r11, r11, r0\n\t" + "adc r12, r12, r1\n\t" /* a += mu * m */ /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ - /* a[6] += t[0] + t[3] */ - /* a[7] += t[1] + t[4] */ - "ldr %[a], [sp, #24]\n\t" - "ldr %[b], [sp, #28]\n\t" - "adds %[a], %[a], r4\n\t" - "adcs %[b], %[b], r5\n\t" - "mov r12, #0\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r7\n\t" - "adcs %[b], %[b], r8\n\t" - "adc r12, r12, #0\n\t" - "str r10, [sp, #24]\n\t" - "str lr, [sp, #28]\n\t" - "str %[b], [sp, #64]\n\t" - /* a[8] += t[0] + t[2] + t[5] */ - /* a[9] += t[1] + t[3] + t[6] */ - /* a[10] += t[2] + t[4] + t[7] */ - "ldr %[a], [sp, #32]\n\t" - "ldr %[b], [sp, #36]\n\t" - "ldr r3, [sp, #40]\n\t" - "adds %[a], %[a], r12\n\t" - "adcs %[b], %[b], #0\n\t" - "adcs r3, r3, #0\n\t" - "mov r12, #0\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r4\n\t" - "adcs %[b], %[b], r5\n\t" - "adcs r3, r3, r6\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r6\n\t" - "adcs %[b], %[b], r7\n\t" + /* a[0] = = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "adds r0, r0, r5\n\t" + "adcs r1, r1, r6\n\t" + "adcs r2, r2, r7\n\t" "adcs r3, r3, r8\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r9\n\t" 
- "adcs %[b], %[b], r10\n\t" - "adcs r3, r3, lr\n\t" - "adc r12, r12, #0\n\t" - "str %[a], [sp, #32]\n\t" - "str %[b], [sp, #36]\n\t" - "str r3, [sp, #40]\n\t" - /* a[11] += t[3] + t[5] */ - /* a[12] += t[4] + t[6] */ - /* a[13] += t[5] + t[7] */ - /* a[14] += t[6] */ - /* a[15] += t[7] */ - "ldr %[a], [sp, #44]\n\t" - "ldr %[b], [sp, #48]\n\t" - "ldr r3, [sp, #52]\n\t" - "ldr r4, [sp, #56]\n\t" - "ldr r5, [sp, #60]\n\t" - "adds %[a], %[a], r12\n\t" - "adcs %[b], %[b], #0\n\t" + "adcs r4, r4, r9\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "str r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "add r0, sp, #32\n\t" + "ldm r0, {r2, r3, r4}\n\t" + "adds r2, r2, lr\n\t" "adcs r3, r3, #0\n\t" "adcs r4, r4, #0\n\t" - "adcs r5, r5, #0\n\t" - "mov r12, #0\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r7\n\t" - "adcs %[b], %[b], r8\n\t" - "adcs r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adcs r5, r5, lr\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r9\n\t" - "adcs %[b], %[b], r10\n\t" - "adcs r3, r3, lr\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r5\n\t" + "adcs r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r7\n\t" + "adcs r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "stm r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "ldm r0, {r0, r1, r2, r3, r4}\n\t" + "adds r0, r0, lr\n\t" + "adcs r1, r1, #0\n\t" + "adcs r2, r2, #0\n\t" + "adcs r3, r3, #0\n\t" "adcs r4, r4, #0\n\t" - "adcs r5, r5, #0\n\t" - "adc r12, r12, #0\n\t" - "str %[a], [sp, #44]\n\t" - "str %[b], [sp, #48]\n\t" - "str r3, [sp, #52]\n\t" - "str r4, [sp, #56]\n\t" - "str r5, [sp, #60]\n\t" - "ldr %[a], [sp, #64]\n\t" - "ldr %[b], [sp, #32]\n\t" - "ldr r3, [sp, #36]\n\t" - "ldr r4, [sp, #40]\n\t" - "ldr r8, [sp]\n\t" - "ldr r9, [sp, #4]\n\t" - "ldr r10, [sp, #8]\n\t" - "ldr lr, [sp, #12]\n\t" - "subs %[a], %[a], r8\n\t" - "sbcs %[b], %[b], r9\n\t" - "sbcs r3, r3, r10\n\t" - "sbcs r4, r4, lr\n\t" - "str %[b], [sp, #32]\n\t" - "str r3, [sp, #36]\n\t" - "str r4, [sp, #40]\n\t" - "ldr %[a], [sp, #44]\n\t" - "ldr %[b], [sp, #48]\n\t" - "ldr r3, [sp, #52]\n\t" - "ldr r4, [sp, #56]\n\t" - "ldr r5, [sp, #60]\n\t" - "ldr r8, [sp, #16]\n\t" - "ldr r9, [sp, #20]\n\t" - "ldr r10, [sp, #24]\n\t" - "ldr lr, [sp, #28]\n\t" - "sbcs %[a], %[a], r8\n\t" - "sbcs %[b], %[b], r9\n\t" - "sbcs r3, r3, r10\n\t" - "sbcs r4, r4, lr\n\t" - "sbc r5, r5, #0\n\t" - /* mask m and sub from result if overflow */ - "rsb r12, r12, #0\n\t" - "and lr, r12, #1\n\t" - "ldr r6, [sp, #32]\n\t" - "ldr r7, [sp, #36]\n\t" - "ldr r8, [sp, #40]\n\t" - "subs r6, r6, r12\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r8\n\t" + "adcs r1, r1, r9\n\t" + "adcs r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r10\n\t" + "adcs r1, r1, r11\n\t" + "adcs r2, r2, r12\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adc lr, lr, #0\n\t" + "str r0, [sp, #44]\n\t" + "str r1, [sp, #48]\n\t" + "str r2, [sp, #52]\n\t" + "str r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "add r0, sp, #28\n\t" + "ldm r0, {r0, r1, r2, r3}\n\t" + "subs r0, r0, r5\n\t" + "sbcs r1, r1, r6\n\t" + "sbcs r2, 
r2, r7\n\t" + "sbcs r3, r3, r8\n\t" + "add r0, sp, #44\n\t" + "mov r8, r4\n\t" + "ldm r0, {r4, r5, r6, r7}\n\t" + "sbcs r4, r4, r9\n\t" + "sbcs r5, r5, r10\n\t" + "sbcs r6, r6, r11\n\t" "sbcs r7, r7, r12\n\t" - "sbcs r8, r8, r12\n\t" - "sbcs %[a], %[a], #0\n\t" - "sbcs %[b], %[b], #0\n\t" - "sbcs r3, r3, #0\n\t" - "sbcs r4, r4, lr\n\t" - "sbc r5, r5, r12\n\t" - "stm %[r]!, {r6, r7, r8}\n\t" - "stm %[r]!, {%[a], %[b], r3, r4, r5}\n\t" + "sbcs r8, r8, #0\n\t" + "sbc lr, lr, #0\n\t" + /* mask m and sub from result if overflow */ + "rsb lr, lr, #0\n\t" + "subs r1, r1, lr\n\t" + "sbcs r2, r2, lr\n\t" + "sbcs r3, r3, lr\n\t" + "sbcs r4, r4, #0\n\t" + "sbcs r5, r5, #0\n\t" + "sbcs r6, r6, #0\n\t" + "sbcs r7, r7, lr, LSR #31\n\t" + "sbc r8, r8, lr\n\t" + "ldr %[r], [sp, #64]\n\t" + "stm %[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" "add sp, sp, #0x44\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r12" ); - (void)mp_p; (void)m_p; (void)mp_p; } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Multiply two Montgomery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montgomery form. + * b Second number to multiply in Montgomery form. + * m Modulus (prime). + * mp Montgomery multiplier. + */ +static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x44\n\t" + "str %[r], [sp, #64]\n\t" + "mov %[r], #0\n\t" + "ldr r12, [%[a]]\n\t" + /* A[0] * B[0] */ + "ldr lr, [%[b]]\n\t" + "umull r3, r4, r12, lr\n\t" + /* A[0] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "umull r5, r6, r12, lr\n\t" + /* A[0] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "umull r7, r8, r12, lr\n\t" + /* A[0] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "umull r9, r10, r12, lr\n\t" + "str r3, [sp]\n\t" + /* A[0] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "mov r11, %[r]\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[0] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adcs r6, r6, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[0] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adcs r8, r8, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[0] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adcs r10, r10, #0\n\t" + "adc r3, %[r], #0\n\t" + "umlal r10, r3, r12, lr\n\t" + /* A[1] * B[0] */ + "ldr r12, [%[a], #4]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "str r4, [sp, #4]\n\t" + "adds r5, r5, r11\n\t" + /* A[1] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[1] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[1] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[1] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[1] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[1] * B[6] */ + "ldr lr, 
[%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[1] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r4, %[r], #0\n\t" + "umlal r3, r4, r12, lr\n\t" + /* A[2] * B[0] */ + "ldr r12, [%[a], #8]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "str r5, [sp, #8]\n\t" + "adds r6, r6, r11\n\t" + /* A[2] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[2] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[2] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[2] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[2] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[2] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[2] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r5, %[r], #0\n\t" + "umlal r4, r5, r12, lr\n\t" + /* A[3] * B[0] */ + "ldr r12, [%[a], #12]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "str r6, [sp, #12]\n\t" + "adds r7, r7, r11\n\t" + /* A[3] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[3] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[3] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[3] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[3] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[3] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[3] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r6, %[r], #0\n\t" + "umlal r5, r6, r12, lr\n\t" + /* A[4] * B[0] */ + "ldr r12, [%[a], #16]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "str r7, [sp, #16]\n\t" + "adds r8, r8, r11\n\t" + /* A[4] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[4] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[4] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[4] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[4] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[4] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[4] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r7, %[r], #0\n\t" + "umlal r6, r7, r12, lr\n\t" + /* A[5] * B[0] */ + "ldr r12, [%[a], #20]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "str r8, [sp, #20]\n\t" + "adds r9, r9, r11\n\t" + /* 
A[5] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[5] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[5] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[5] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[5] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[5] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[5] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r8, %[r], #0\n\t" + "umlal r7, r8, r12, lr\n\t" + /* A[6] * B[0] */ + "ldr r12, [%[a], #24]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "str r9, [sp, #24]\n\t" + "adds r10, r10, r11\n\t" + /* A[6] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[6] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[6] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[6] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[6] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[6] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[6] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r9, %[r], #0\n\t" + "umlal r8, r9, r12, lr\n\t" + /* A[7] * B[0] */ + "ldr r12, [%[a], #28]\n\t" + "ldr lr, [%[b]]\n\t" + "mov r11, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "str r10, [sp, #28]\n\t" + "adds r3, r3, r11\n\t" + /* A[7] * B[1] */ + "ldr lr, [%[b], #4]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[7] * B[2] */ + "ldr lr, [%[b], #8]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[7] * B[3] */ + "ldr lr, [%[b], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[7] * B[4] */ + "ldr lr, [%[b], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[7] * B[5] */ + "ldr lr, [%[b], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[7] * B[6] */ + "ldr lr, [%[b], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[7] * B[7] */ + "ldr lr, [%[b], #28]\n\t" + "adc r10, %[r], #0\n\t" + "umlal r9, r10, r12, lr\n\t" + "add lr, sp, #32\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* Start Reduction */ + "ldm sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "str %[r], [sp]\n\t" + "mov r3, r11\n\t" + "mov r4, r12\n\t" + /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ + /* - a[0] << 224 */ + /* + (a[0]-a[1] * 2) << (6 * 32) */ + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" + /* - a[0] << (7 * 32) */ + "sub r12, r12, r5\n\t" + /* + a[0]-a[4] << (3 * 32) */ + "mov r0, r8\n\t" + "mov r1, 
r9\n\t" + "mov r2, r10\n\t" + "adds r8, r8, r5\n\t" + "adcs r9, r9, r6\n\t" + "adcs r10, r10, r7\n\t" + "adcs r11, r11, r0\n\t" + "adc r12, r12, r1\n\t" + /* a += mu * m */ + /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ + /* a[0] = = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "adds r0, r0, r5\n\t" + "adcs r1, r1, r6\n\t" + "adcs r2, r2, r7\n\t" + "adcs r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "str r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "add r0, sp, #32\n\t" + "ldm r0, {r2, r3, r4}\n\t" + "adds r2, r2, lr\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r5\n\t" + "adcs r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r7\n\t" + "adcs r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "stm r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "ldm r0, {r0, r1, r2, r3, r4}\n\t" + "adds r0, r0, lr\n\t" + "adcs r1, r1, #0\n\t" + "adcs r2, r2, #0\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r8\n\t" + "adcs r1, r1, r9\n\t" + "adcs r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r10\n\t" + "adcs r1, r1, r11\n\t" + "adcs r2, r2, r12\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adc lr, lr, #0\n\t" + "str r0, [sp, #44]\n\t" + "str r1, [sp, #48]\n\t" + "str r2, [sp, #52]\n\t" + "str r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "add r0, sp, #28\n\t" + "ldm r0, {r0, r1, r2, r3}\n\t" + "subs r0, r0, r5\n\t" + "sbcs r1, r1, r6\n\t" + "sbcs r2, r2, r7\n\t" + "sbcs r3, r3, r8\n\t" + "add r0, sp, #44\n\t" + "mov r8, r4\n\t" + "ldm r0, {r4, r5, r6, r7}\n\t" + "sbcs r4, r4, r9\n\t" + "sbcs r5, r5, r10\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r12\n\t" + "sbcs r8, r8, #0\n\t" + "sbc lr, lr, #0\n\t" + /* mask m and sub from result if overflow */ + "rsb lr, lr, #0\n\t" + "subs r1, r1, lr\n\t" + "sbcs r2, r2, lr\n\t" + "sbcs r3, r3, lr\n\t" + "sbcs r4, r4, #0\n\t" + "sbcs r5, r5, #0\n\t" + "sbcs r6, r6, #0\n\t" + "sbcs r7, r7, lr, LSR #31\n\t" + "sbc r8, r8, lr\n\t" + "ldr %[r], [sp, #64]\n\t" + "stm %[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "add sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + (void)m_p; + (void)mp_p; +} + +#else +/* Multiply two Montgomery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montgomery form. + * b Second number to multiply in Montgomery form. + * m Modulus (prime). + * mp Montgomery multiplier. 
+ */ +static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x4c\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str %[r], [sp, #68]\n\t" + "str %[a], [sp, #72]\n\t" +#else + "strd %[r], %[a], [sp, #68]\n\t" +#endif + "mov lr, %[b]\n\t" + "ldm %[a], {%[r], %[a], %[b], r3}\n\t" + "ldm lr!, {r4, r5, r6}\n\t" + "umull r10, r11, %[r], r4\n\t" + "umull r12, r7, %[a], r4\n\t" + "umaal r11, r12, %[r], r5\n\t" + "umull r8, r9, %[b], r4\n\t" + "umaal r12, r8, %[a], r5\n\t" + "umaal r12, r7, %[r], r6\n\t" + "umaal r8, r9, r3, r4\n\t" + "stm sp, {r10, r11, r12}\n\t" + "umaal r7, r8, %[b], r5\n\t" + "ldm lr!, {r4}\n\t" + "umull r10, r11, %[a], r6\n\t" + "umaal r8, r9, %[b], r6\n\t" + "umaal r7, r10, %[r], r4\n\t" + "umaal r8, r11, r3, r5\n\t" + "str r7, [sp, #12]\n\t" + "umaal r8, r10, %[a], r4\n\t" + "umaal r9, r11, r3, r6\n\t" + "umaal r9, r10, %[b], r4\n\t" + "umaal r10, r11, r3, r4\n\t" + "ldm lr, {r4, r5, r6, r7}\n\t" + "mov r12, #0\n\t" + "umlal r8, r12, %[r], r4\n\t" + "umaal r9, r12, %[a], r4\n\t" + "umaal r10, r12, %[b], r4\n\t" + "umaal r11, r12, r3, r4\n\t" + "mov r4, #0\n\t" + "umlal r9, r4, %[r], r5\n\t" + "umaal r10, r4, %[a], r5\n\t" + "umaal r11, r4, %[b], r5\n\t" + "umaal r12, r4, r3, r5\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, %[r], r6\n\t" + "umaal r11, r5, %[a], r6\n\t" + "umaal r12, r5, %[b], r6\n\t" + "umaal r4, r5, r3, r6\n\t" + "mov r6, #0\n\t" + "umlal r11, r6, %[r], r7\n\t" + "ldr %[r], [sp, #72]\n\t" + "umaal r12, r6, %[a], r7\n\t" + "add %[r], %[r], #16\n\t" + "umaal r4, r6, %[b], r7\n\t" + "sub lr, lr, #16\n\t" + "umaal r5, r6, r3, r7\n\t" + "ldm %[r], {%[r], %[a], %[b], r3}\n\t" + "str r6, [sp, #64]\n\t" + "ldm lr!, {r6}\n\t" + "mov r7, #0\n\t" + "umlal r8, r7, %[r], r6\n\t" + "umaal r9, r7, %[a], r6\n\t" + "str r8, [sp, #16]\n\t" + "umaal r10, r7, %[b], r6\n\t" + "umaal r11, r7, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r8, #0\n\t" + "umlal r9, r8, %[r], r6\n\t" + "umaal r10, r8, %[a], r6\n\t" + "str r9, [sp, #20]\n\t" + "umaal r11, r8, %[b], r6\n\t" + "umaal r12, r8, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r9, #0\n\t" + "umlal r10, r9, %[r], r6\n\t" + "umaal r11, r9, %[a], r6\n\t" + "str r10, [sp, #24]\n\t" + "umaal r12, r9, %[b], r6\n\t" + "umaal r4, r9, r3, r6\n\t" + "ldm lr!, {r6}\n\t" + "mov r10, #0\n\t" + "umlal r11, r10, %[r], r6\n\t" + "umaal r12, r10, %[a], r6\n\t" + "str r11, [sp, #28]\n\t" + "umaal r4, r10, %[b], r6\n\t" + "umaal r5, r10, r3, r6\n\t" + "ldm lr!, {r11}\n\t" + "umaal r12, r7, %[r], r11\n\t" + "umaal r4, r7, %[a], r11\n\t" + "ldr r6, [sp, #64]\n\t" + "umaal r5, r7, %[b], r11\n\t" + "umaal r6, r7, r3, r11\n\t" + "ldm lr!, {r11}\n\t" + "umaal r4, r8, %[r], r11\n\t" + "umaal r5, r8, %[a], r11\n\t" + "umaal r6, r8, %[b], r11\n\t" + "umaal r7, r8, r3, r11\n\t" + "ldm lr, {r11, lr}\n\t" + "umaal r5, r9, %[r], r11\n\t" + "umaal r6, r10, %[r], lr\n\t" + "umaal r6, r9, %[a], r11\n\t" + "umaal r7, r10, %[a], lr\n\t" + "umaal r7, r9, %[b], r11\n\t" + "umaal r8, r10, %[b], lr\n\t" + "umaal r8, r9, r3, r11\n\t" + "umaal r9, r10, r3, lr\n\t" + "mov r3, r12\n\t" + "add lr, sp, #32\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* Start Reduction */ + "ldm sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "str %[r], [sp]\n\t" + "mov r3, 
r11\n\t" + "mov r4, r12\n\t" + /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ + /* - a[0] << 224 */ + /* + (a[0]-a[1] * 2) << (6 * 32) */ + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" + /* - a[0] << (7 * 32) */ + "sub r12, r12, r5\n\t" + /* + a[0]-a[4] << (3 * 32) */ + "mov r0, r8\n\t" + "mov r1, r9\n\t" + "mov r2, r10\n\t" + "adds r8, r8, r5\n\t" + "adcs r9, r9, r6\n\t" + "adcs r10, r10, r7\n\t" + "adcs r11, r11, r0\n\t" + "adc r12, r12, r1\n\t" + /* a += mu * m */ + /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ + /* a[0] = = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "adds r0, r0, r5\n\t" + "adcs r1, r1, r6\n\t" + "adcs r2, r2, r7\n\t" + "adcs r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "str r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "add r0, sp, #32\n\t" + "ldm r0, {r2, r3, r4}\n\t" + "adds r2, r2, lr\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r5\n\t" + "adcs r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r7\n\t" + "adcs r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "stm r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "ldm r0, {r0, r1, r2, r3, r4}\n\t" + "adds r0, r0, lr\n\t" + "adcs r1, r1, #0\n\t" + "adcs r2, r2, #0\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r8\n\t" + "adcs r1, r1, r9\n\t" + "adcs r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r10\n\t" + "adcs r1, r1, r11\n\t" + "adcs r2, r2, r12\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adc lr, lr, #0\n\t" + "str r0, [sp, #44]\n\t" + "str r1, [sp, #48]\n\t" + "str r2, [sp, #52]\n\t" + "str r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "add r0, sp, #28\n\t" + "ldm r0, {r0, r1, r2, r3}\n\t" + "subs r0, r0, r5\n\t" + "sbcs r1, r1, r6\n\t" + "sbcs r2, r2, r7\n\t" + "sbcs r3, r3, r8\n\t" + "add r0, sp, #44\n\t" + "mov r8, r4\n\t" + "ldm r0, {r4, r5, r6, r7}\n\t" + "sbcs r4, r4, r9\n\t" + "sbcs r5, r5, r10\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r12\n\t" + "sbcs r8, r8, #0\n\t" + "sbc lr, lr, #0\n\t" + /* mask m and sub from result if overflow */ + "rsb lr, lr, #0\n\t" + "subs r1, r1, lr\n\t" + "sbcs r2, r2, lr\n\t" + "sbcs r3, r3, lr\n\t" + "sbcs r4, r4, #0\n\t" + "sbcs r5, r5, #0\n\t" + "sbcs r6, r6, #0\n\t" + "sbcs r7, r7, lr, LSR #31\n\t" + "sbc r8, r8, lr\n\t" + "ldr %[r], [sp, #68]\n\t" + "stm %[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "add sp, sp, #0x4c\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr" + ); + (void)m_p; + (void)mp_p; +} + +#endif +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m) * * r Result of squaring. 
@@ -67966,16 +68541,16 @@ static SP_NOINLINE void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, co */ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( + "sub sp, sp, #0x44\n\t" "sub sp, sp, #0x44\n\t" "mov r5, #0\n\t" /* A[0] * A[1] */ "ldr r6, [%[a]]\n\t" "ldr r7, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r9, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -67997,13 +68572,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r3, r3, #16\n\t" "adds r9, r9, r3\n\t" "adc r10, r10, r4\n\t" -#else - "umull r9, r10, r6, r7\n\t" -#endif "str r9, [sp, #4]\n\t" /* A[0] * A[2] */ "ldr r7, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68029,15 +68600,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r3, r3, #16\n\t" "adds r10, r10, r3\n\t" "adc lr, lr, r4\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adc lr, r4, #0\n\t" -#endif "str r10, [sp, #8]\n\t" /* A[0] * A[3] */ "ldr r7, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68063,15 +68628,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r3, r3, #16\n\t" "adds lr, lr, r3\n\t" "adc r8, r8, r4\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adc r8, r4, #0\n\t" -#endif /* A[1] * A[2] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68100,16 +68659,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, #0\n\t" -#endif "str lr, [sp, #12]\n\t" /* A[1] * A[3] */ "ldr r7, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68134,15 +68686,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r3, r3, #16\n\t" "adds r8, r8, r3\n\t" "adc r9, r9, r4\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adc r9, r4, r9\n\t" -#endif /* A[0] * A[4] */ "ldr r6, [%[a]]\n\t" "ldr r7, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68171,16 +68717,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" -#endif "str r8, [sp, #16]\n\t" /* A[0] * A[5] */ "ldr r7, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68205,15 +68744,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, 
const sp_digit* a_p, co "lsl r3, r3, #16\n\t" "adds r9, r9, r3\n\t" "adc r10, r10, r4\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adc r10, r4, r10\n\t" -#endif /* A[1] * A[4] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68242,16 +68775,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, #0\n\t" -#endif /* A[2] * A[3] */ "ldr r6, [%[a], #8]\n\t" "ldr r7, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68280,16 +68806,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif "str r9, [sp, #20]\n\t" /* A[2] * A[4] */ "ldr r7, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68318,16 +68837,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, #0\n\t" -#endif /* A[1] * A[5] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68356,16 +68868,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif /* A[0] * A[6] */ "ldr r6, [%[a]]\n\t" "ldr r7, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68394,16 +68899,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif "str r10, [sp, #24]\n\t" /* A[0] * A[7] */ "ldr r7, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68432,16 +68930,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, #0\n\t" -#endif /* A[1] * A[6] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68470,16 +68961,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, 
r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[2] * A[5] */ "ldr r6, [%[a], #8]\n\t" "ldr r7, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68508,16 +68992,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif /* A[3] * A[4] */ "ldr r6, [%[a], #12]\n\t" "ldr r7, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68546,16 +69023,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif "str lr, [sp, #28]\n\t" /* A[3] * A[5] */ "ldr r7, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68584,16 +69054,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" -#endif /* A[2] * A[6] */ "ldr r6, [%[a], #8]\n\t" "ldr r7, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68622,16 +69085,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif /* A[1] * A[7] */ "ldr r6, [%[a], #4]\n\t" "ldr r7, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68660,16 +69116,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" -#endif "str r8, [sp, #32]\n\t" /* A[2] * A[7] */ "ldr r6, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68698,16 +69147,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, #0\n\t" -#endif /* A[3] * A[6] */ "ldr r6, [%[a], #12]\n\t" "ldr r7, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68736,16 +69178,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif /* A[4] * A[5] */ "ldr r6, [%[a], #16]\n\t" "ldr r7, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68774,16 +69209,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r9, r9, r3\n\t" "adcs r10, r10, r4\n\t" "adc lr, lr, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r4, r10\n\t" - "adc lr, r5, lr\n\t" -#endif "str r9, [sp, #36]\n\t" /* A[4] * A[6] */ "ldr r7, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68812,16 +69240,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, #0\n\t" -#endif /* A[3] * A[7] */ "ldr r6, [%[a], #12]\n\t" "ldr r7, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68850,16 +69271,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r10, r10, r3\n\t" "adcs lr, lr, r4\n\t" "adc r8, r8, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r10, r10, r3\n\t" - "adcs lr, r4, lr\n\t" - "adc r8, r5, r8\n\t" -#endif "str r10, [sp, #40]\n\t" /* A[4] * A[7] */ "ldr r6, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68888,16 +69302,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, #0\n\t" -#endif /* A[5] * A[6] */ "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68926,16 +69333,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds lr, lr, r3\n\t" "adcs r8, r8, r4\n\t" "adc r9, r9, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds lr, lr, r3\n\t" - "adcs r8, r4, r8\n\t" - "adc r9, r5, r9\n\t" -#endif "str lr, [sp, #44]\n\t" /* A[5] * A[7] */ "ldr r7, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68964,16 +69364,9 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "adds r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" "adc r10, r10, #0\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r8, r8, r3\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" -#endif "str r8, [sp, #48]\n\t" /* A[6] * A[7] */ "ldr r6, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r3, r6, #16\n\t" "lsl r4, r7, #16\n\t" "lsr r3, r3, #16\n\t" @@ -68998,11 +69391,6 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r3, r3, #16\n\t" "adds r9, r9, r3\n\t" "adc r10, r10, r4\n\t" -#else - "umull r3, r4, r6, r7\n\t" - "adds r9, r9, r3\n\t" - "adc r10, r4, r10\n\t" -#endif "str r9, [sp, #52]\n\t" "str r10, [sp, #56]\n\t" /* Double */ @@ -69055,7 +69443,6 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "ldr r12, [sp, #12]\n\t" /* A[0] * A[0] */ "ldr r6, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "lsr r6, r6, #16\n\t" @@ -69066,12 +69453,8 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r6, r6, #17\n\t" "adds r8, r8, r6\n\t" "adc r9, r9, r7\n\t" -#else - "umull r8, r9, r6, r6\n\t" -#endif /* A[1] * A[1] */ "ldr r6, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "lsr r6, r6, #16\n\t" @@ -69082,9 +69465,6 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r6, r6, #17\n\t" "adds r10, r10, r6\n\t" "adc lr, lr, r7\n\t" -#else - "umull r10, lr, r6, r6\n\t" -#endif "adds r9, r9, r4\n\t" "adcs r10, r10, r5\n\t" "adcs lr, lr, r12\n\t" @@ -69098,7 +69478,6 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "ldr r12, [sp, #28]\n\t" /* A[2] * A[2] */ "ldr r6, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "lsr r6, r6, #16\n\t" @@ -69109,12 +69488,8 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r6, r6, #17\n\t" "adcs r8, r8, r6\n\t" "adc r9, r9, r7\n\t" -#else - "umull r8, r9, r6, r6\n\t" -#endif /* A[3] * A[3] */ "ldr r6, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "lsr r6, r6, #16\n\t" @@ -69125,10 +69500,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r6, r6, #17\n\t" "adds r10, r10, r6\n\t" "adc lr, lr, r7\n\t" -#else - "umull r10, lr, r6, r6\n\t" -#endif -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "adcs r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" #else @@ -69147,7 +69519,6 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "ldr r12, [sp, #44]\n\t" /* A[4] * A[4] */ "ldr r6, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "lsr r6, r6, #16\n\t" @@ -69158,12 +69529,8 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r6, r6, #17\n\t" "adcs r8, r8, r6\n\t" "adc r9, r9, r7\n\t" -#else - "umull r8, r9, r6, r6\n\t" -#endif /* A[5] * A[5] */ "ldr r6, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "lsr r6, r6, #16\n\t" @@ -69174,10 +69541,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r6, r6, #17\n\t" "adds r10, r10, r6\n\t" "adc lr, lr, r7\n\t" -#else - "umull r10, lr, r6, r6\n\t" -#endif -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "adcs r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" #else @@ -69196,7 +69560,6 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "ldr r12, [sp, #60]\n\t" /* A[6] * A[6] */ "ldr r6, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "lsr r6, r6, #16\n\t" @@ -69207,12 +69570,8 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r6, r6, #17\n\t" "adcs r8, r8, r6\n\t" "adc r9, r9, r7\n\t" -#else - "umull r8, r9, r6, r6\n\t" -#endif /* A[7] * A[7] */ "ldr r6, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH 
< 4) "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "lsr r6, r6, #16\n\t" @@ -69223,10 +69582,7 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "lsl r6, r6, #17\n\t" "adds r10, r10, r6\n\t" "adc lr, lr, r7\n\t" -#else - "umull r10, lr, r6, r6\n\t" -#endif -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "adcs r8, r8, r3\n\t" "adcs r9, r9, r4\n\t" #else @@ -69239,156 +69595,131 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co "str r9, [sp, #52]\n\t" "str r10, [sp, #56]\n\t" "str lr, [sp, #60]\n\t" + "str %[r], [sp, #64]\n\t" /* Start Reduction */ - "ldr r4, [sp]\n\t" - "ldr r5, [sp, #4]\n\t" - "ldr r6, [sp, #8]\n\t" - "ldr r7, [sp, #12]\n\t" - "ldr r8, [sp, #16]\n\t" - "ldr r9, [sp, #20]\n\t" - "ldr r10, [sp, #24]\n\t" - "ldr lr, [sp, #28]\n\t" - /* mu = a[0..7] + a[0..4] << 96 + (a[0..1] * 2) << 192 */ + "ldm sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "str %[r], [sp]\n\t" + "mov r3, r11\n\t" + "mov r4, r12\n\t" + /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ /* - a[0] << 224 */ - /* + (a[0..1] * 2) << (6 * 32) */ - "adds r10, r10, r4\n\t" - "adc lr, lr, r5\n\t" - "adds r10, r10, r4\n\t" - "adc lr, lr, r5\n\t" + /* + (a[0]-a[1] * 2) << (6 * 32) */ + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" /* - a[0] << (7 * 32) */ - "sub lr, lr, r4\n\t" - /* + a[0..4] << (3 * 32) */ - "mov %[a], r7\n\t" - "mov r12, r8\n\t" - "adds r7, r7, r4\n\t" - "adcs r8, r8, r5\n\t" + "sub r12, r12, r5\n\t" + /* + a[0]-a[4] << (3 * 32) */ + "mov r0, r8\n\t" + "mov r1, r9\n\t" + "mov r2, r10\n\t" + "adds r8, r8, r5\n\t" "adcs r9, r9, r6\n\t" - "adcs r10, r10, %[a]\n\t" - "adc lr, lr, r12\n\t" - "str r7, [sp, #12]\n\t" - "str r8, [sp, #16]\n\t" - "str r9, [sp, #20]\n\t" + "adcs r10, r10, r7\n\t" + "adcs r11, r11, r0\n\t" + "adc r12, r12, r1\n\t" /* a += mu * m */ /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ - /* a[6] += t[0] + t[3] */ - /* a[7] += t[1] + t[4] */ - "ldr %[a], [sp, #24]\n\t" - "ldr r2, [sp, #28]\n\t" - "adds %[a], %[a], r4\n\t" - "adcs r2, r2, r5\n\t" - "mov r12, #0\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r7\n\t" - "adcs r2, r2, r8\n\t" - "adc r12, r12, #0\n\t" - "str r10, [sp, #24]\n\t" - "str lr, [sp, #28]\n\t" - "str r2, [sp, #64]\n\t" - /* a[8] += t[0] + t[2] + t[5] */ - /* a[9] += t[1] + t[3] + t[6] */ - /* a[10] += t[2] + t[4] + t[7] */ - "ldr %[a], [sp, #32]\n\t" - "ldr r2, [sp, #36]\n\t" - "ldr r3, [sp, #40]\n\t" - "adds %[a], %[a], r12\n\t" - "adcs r2, r2, #0\n\t" - "adcs r3, r3, #0\n\t" - "mov r12, #0\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r4\n\t" - "adcs r2, r2, r5\n\t" - "adcs r3, r3, r6\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r6\n\t" + /* a[0] = = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "adds r0, r0, r5\n\t" + "adcs r1, r1, r6\n\t" "adcs r2, r2, r7\n\t" "adcs r3, r3, r8\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r9\n\t" - "adcs r2, r2, r10\n\t" - "adcs r3, r3, lr\n\t" - "adc r12, r12, #0\n\t" - "str %[a], [sp, #32]\n\t" - "str r2, [sp, #36]\n\t" - "str r3, [sp, #40]\n\t" - /* a[11] += t[3] + t[5] */ - /* a[12] += t[4] + t[6] */ - /* a[13] += t[5] + t[7] */ - /* a[14] += t[6] */ - /* a[15] += t[7] */ - "ldr %[a], [sp, #44]\n\t" - "ldr r2, 
[sp, #48]\n\t" - "ldr r3, [sp, #52]\n\t" - "ldr r4, [sp, #56]\n\t" - "ldr r5, [sp, #60]\n\t" - "adds %[a], %[a], r12\n\t" + "adcs r4, r4, r9\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "str r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "add r0, sp, #32\n\t" + "ldm r0, {r2, r3, r4}\n\t" + "adds r2, r2, lr\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r5\n\t" + "adcs r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r7\n\t" + "adcs r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "stm r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "ldm r0, {r0, r1, r2, r3, r4}\n\t" + "adds r0, r0, lr\n\t" + "adcs r1, r1, #0\n\t" "adcs r2, r2, #0\n\t" "adcs r3, r3, #0\n\t" "adcs r4, r4, #0\n\t" - "adcs r5, r5, #0\n\t" - "mov r12, #0\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r7\n\t" - "adcs r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adcs r5, r5, lr\n\t" - "adc r12, r12, #0\n\t" - "adds %[a], %[a], r9\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r8\n\t" + "adcs r1, r1, r9\n\t" "adcs r2, r2, r10\n\t" - "adcs r3, r3, lr\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r10\n\t" + "adcs r1, r1, r11\n\t" + "adcs r2, r2, r12\n\t" + "adcs r3, r3, #0\n\t" "adcs r4, r4, #0\n\t" - "adcs r5, r5, #0\n\t" - "adc r12, r12, #0\n\t" - "str %[a], [sp, #44]\n\t" - "str r2, [sp, #48]\n\t" - "str r3, [sp, #52]\n\t" - "str r4, [sp, #56]\n\t" - "str r5, [sp, #60]\n\t" - "ldr %[a], [sp, #64]\n\t" - "ldr r2, [sp, #32]\n\t" - "ldr r3, [sp, #36]\n\t" - "ldr r4, [sp, #40]\n\t" - "ldr r8, [sp]\n\t" - "ldr r9, [sp, #4]\n\t" - "ldr r10, [sp, #8]\n\t" - "ldr lr, [sp, #12]\n\t" - "subs %[a], %[a], r8\n\t" - "sbcs r2, r2, r9\n\t" - "sbcs r3, r3, r10\n\t" - "sbcs r4, r4, lr\n\t" - "str r2, [sp, #32]\n\t" - "str r3, [sp, #36]\n\t" - "str r4, [sp, #40]\n\t" - "ldr %[a], [sp, #44]\n\t" - "ldr r2, [sp, #48]\n\t" - "ldr r3, [sp, #52]\n\t" - "ldr r4, [sp, #56]\n\t" - "ldr r5, [sp, #60]\n\t" - "ldr r8, [sp, #16]\n\t" - "ldr r9, [sp, #20]\n\t" - "ldr r10, [sp, #24]\n\t" - "ldr lr, [sp, #28]\n\t" - "sbcs %[a], %[a], r8\n\t" - "sbcs r2, r2, r9\n\t" - "sbcs r3, r3, r10\n\t" - "sbcs r4, r4, lr\n\t" - "sbc r5, r5, #0\n\t" - /* mask m and sub from result if overflow */ - "rsb r12, r12, #0\n\t" - "and lr, r12, #1\n\t" - "ldr r6, [sp, #32]\n\t" - "ldr r7, [sp, #36]\n\t" - "ldr r8, [sp, #40]\n\t" - "subs r6, r6, r12\n\t" + "adc lr, lr, #0\n\t" + "str r0, [sp, #44]\n\t" + "str r1, [sp, #48]\n\t" + "str r2, [sp, #52]\n\t" + "str r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "add r0, sp, #28\n\t" + "ldm r0, {r0, r1, r2, r3}\n\t" + "subs r0, r0, r5\n\t" + "sbcs r1, r1, r6\n\t" + "sbcs r2, r2, r7\n\t" + "sbcs r3, r3, r8\n\t" + "add r0, sp, #44\n\t" + "mov r8, r4\n\t" + "ldm r0, {r4, r5, r6, r7}\n\t" + "sbcs r4, r4, r9\n\t" + "sbcs r5, r5, r10\n\t" + "sbcs r6, r6, r11\n\t" "sbcs r7, r7, r12\n\t" - "sbcs r8, r8, r12\n\t" - "sbcs %[a], %[a], #0\n\t" - "sbcs r2, r2, #0\n\t" - "sbcs r3, r3, #0\n\t" - "sbcs r4, r4, lr\n\t" - "sbc r5, r5, r12\n\t" - "stm %[r]!, {r6, r7, r8}\n\t" - "stm %[r]!, {%[a], r2, 
r3, r4, r5}\n\t" + "sbcs r8, r8, #0\n\t" + "sbc lr, lr, #0\n\t" + /* mask m and sub from result if overflow */ + "rsb lr, lr, #0\n\t" + "subs r1, r1, lr\n\t" + "sbcs r2, r2, lr\n\t" + "sbcs r3, r3, lr\n\t" + "sbcs r4, r4, #0\n\t" + "sbcs r5, r5, #0\n\t" + "sbcs r6, r6, #0\n\t" + "sbcs r7, r7, lr, LSR #31\n\t" + "sbc r8, r8, lr\n\t" + "ldr %[r], [sp, #64]\n\t" + "stm %[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" "add sp, sp, #0x44\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -69398,6 +69729,615 @@ static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, co (void)mp_p; } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montgomery form. + * m Modulus (prime). + * mp Montgomery multiplier. + */ +static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x44\n\t" + "str %[r], [sp, #64]\n\t" + "mov %[r], #0\n\t" + "ldr r12, [%[a]]\n\t" + /* A[0] * A[1] */ + "ldr lr, [%[a], #4]\n\t" + "umull r4, r5, r12, lr\n\t" + /* A[0] * A[3] */ + "ldr lr, [%[a], #12]\n\t" + "umull r6, r7, r12, lr\n\t" + /* A[0] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "umull r8, r9, r12, lr\n\t" + /* A[0] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "umull r10, r3, r12, lr\n\t" + /* A[0] * A[2] */ + "ldr lr, [%[a], #8]\n\t" + "mov r11, #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[0] * A[4] */ + "ldr lr, [%[a], #16]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "adds r8, r8, r11\n\t" + /* A[0] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adcs r9, r9, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + "adcs r3, r3, #0\n\t" + "str r4, [sp, #4]\n\t" + "str r5, [sp, #8]\n\t" + /* A[1] * A[2] */ + "ldr r12, [%[a], #4]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "str r6, [sp, #12]\n\t" + "adds r7, r7, r11\n\t" + /* A[1] * A[3] */ + "ldr lr, [%[a], #12]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, lr\n\t" + "str r7, [sp, #16]\n\t" + "adds r8, r8, r11\n\t" + /* A[1] * A[4] */ + "ldr lr, [%[a], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "adds r9, r9, r11\n\t" + /* A[1] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "adds r10, r10, r11\n\t" + /* A[1] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[1] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r4, %[r], #0\n\t" + "umlal r3, r4, r12, lr\n\t" + /* A[2] * A[3] */ + "ldr r12, [%[a], #8]\n\t" + "ldr lr, [%[a], #12]\n\t" + "mov r11, #0\n\t" + "umlal r8, r11, r12, lr\n\t" + "str r8, [sp, #20]\n\t" + "adds r9, r9, r11\n\t" + /* A[2] * A[4] */ + "ldr lr, [%[a], #16]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, lr\n\t" + "str r9, [sp, #24]\n\t" + "adds r10, r10, r11\n\t" + /* A[2] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "adds r3, r3, r11\n\t" + /* A[2] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[2] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r5, %[r], #0\n\t" + "umlal r4, r5, 
r12, lr\n\t" + /* A[3] * A[4] */ + "ldr r12, [%[a], #12]\n\t" + "ldr lr, [%[a], #16]\n\t" + "mov r11, #0\n\t" + "umlal r10, r11, r12, lr\n\t" + "str r10, [sp, #28]\n\t" + "adds r3, r3, r11\n\t" + /* A[3] * A[5] */ + "ldr lr, [%[a], #20]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, lr\n\t" + "adds r4, r4, r11\n\t" + /* A[3] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[3] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r6, %[r], #0\n\t" + "umlal r5, r6, r12, lr\n\t" + /* A[4] * A[5] */ + "ldr r12, [%[a], #16]\n\t" + "ldr lr, [%[a], #20]\n\t" + "mov r11, #0\n\t" + "umlal r4, r11, r12, lr\n\t" + "adds r5, r5, r11\n\t" + /* A[4] * A[6] */ + "ldr lr, [%[a], #24]\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, lr\n\t" + "adds r6, r6, r11\n\t" + /* A[4] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r7, %[r], #0\n\t" + "umlal r6, r7, r12, lr\n\t" + /* A[5] * A[6] */ + "ldr r12, [%[a], #20]\n\t" + "ldr lr, [%[a], #24]\n\t" + "mov r11, #0\n\t" + "umlal r6, r11, r12, lr\n\t" + "adds r7, r7, r11\n\t" + /* A[5] * A[7] */ + "ldr lr, [%[a], #28]\n\t" + "adc r8, %[r], #0\n\t" + "umlal r7, r8, r12, lr\n\t" + /* A[6] * A[7] */ + "ldr r12, [%[a], #24]\n\t" + "ldr lr, [%[a], #28]\n\t" + "mov r9, #0\n\t" + "umlal r8, r9, r12, lr\n\t" + "add lr, sp, #32\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "add lr, sp, #4\n\t" + "ldm lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "adds r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "stm lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "adcs r3, r3, r3\n\t" + "adcs r4, r4, r4\n\t" + "adcs r5, r5, r5\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adc r10, %[r], #0\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "add lr, sp, #4\n\t" + "ldm lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "mov lr, sp\n\t" + /* A[0] * A[0] */ + "ldr r12, [%[a]]\n\t" + "umull r3, r11, r12, r12\n\t" + "adds r4, r4, r11\n\t" + /* A[1] * A[1] */ + "ldr r12, [%[a], #4]\n\t" + "adcs r5, r5, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, r12\n\t" + "adds r6, r6, r11\n\t" + /* A[2] * A[2] */ + "ldr r12, [%[a], #8]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, r12\n\t" + "adds r8, r8, r11\n\t" + /* A[3] * A[3] */ + "ldr r12, [%[a], #12]\n\t" + "adcs r9, r9, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r9, r11, r12, r12\n\t" + "adds r10, r10, r11\n\t" + "stm lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ldm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* A[4] * A[4] */ + "ldr r12, [%[a], #16]\n\t" + "adcs r3, r3, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r3, r11, r12, r12\n\t" + "adds r4, r4, r11\n\t" + /* A[5] * A[5] */ + "ldr r12, [%[a], #20]\n\t" + "adcs r5, r5, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r5, r11, r12, r12\n\t" + "adds r6, r6, r11\n\t" + /* A[6] * A[6] */ + "ldr r12, [%[a], #24]\n\t" + "adcs r7, r7, #0\n\t" + "adc r11, %[r], #0\n\t" + "umlal r7, r11, r12, r12\n\t" + "adds r8, r8, r11\n\t" + /* A[7] * A[7] */ + "ldr r12, [%[a], #28]\n\t" + "adcs r9, r9, #0\n\t" + "adc r10, r10, #0\n\t" + "umlal r9, r10, r12, r12\n\t" + "add lr, sp, #32\n\t" + "stm lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* Start Reduction */ + "ldm sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "str %[r], [sp]\n\t" + "mov r3, r11\n\t" + "mov r4, r12\n\t" + /* mu = 
a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ + /* - a[0] << 224 */ + /* + (a[0]-a[1] * 2) << (6 * 32) */ + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" + /* - a[0] << (7 * 32) */ + "sub r12, r12, r5\n\t" + /* + a[0]-a[4] << (3 * 32) */ + "mov r0, r8\n\t" + "mov r1, r9\n\t" + "mov r2, r10\n\t" + "adds r8, r8, r5\n\t" + "adcs r9, r9, r6\n\t" + "adcs r10, r10, r7\n\t" + "adcs r11, r11, r0\n\t" + "adc r12, r12, r1\n\t" + /* a += mu * m */ + /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ + /* a[0] = = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "adds r0, r0, r5\n\t" + "adcs r1, r1, r6\n\t" + "adcs r2, r2, r7\n\t" + "adcs r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "str r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "add r0, sp, #32\n\t" + "ldm r0, {r2, r3, r4}\n\t" + "adds r2, r2, lr\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r5\n\t" + "adcs r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r7\n\t" + "adcs r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "stm r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "ldm r0, {r0, r1, r2, r3, r4}\n\t" + "adds r0, r0, lr\n\t" + "adcs r1, r1, #0\n\t" + "adcs r2, r2, #0\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r8\n\t" + "adcs r1, r1, r9\n\t" + "adcs r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r10\n\t" + "adcs r1, r1, r11\n\t" + "adcs r2, r2, r12\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adc lr, lr, #0\n\t" + "str r0, [sp, #44]\n\t" + "str r1, [sp, #48]\n\t" + "str r2, [sp, #52]\n\t" + "str r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "add r0, sp, #28\n\t" + "ldm r0, {r0, r1, r2, r3}\n\t" + "subs r0, r0, r5\n\t" + "sbcs r1, r1, r6\n\t" + "sbcs r2, r2, r7\n\t" + "sbcs r3, r3, r8\n\t" + "add r0, sp, #44\n\t" + "mov r8, r4\n\t" + "ldm r0, {r4, r5, r6, r7}\n\t" + "sbcs r4, r4, r9\n\t" + "sbcs r5, r5, r10\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r12\n\t" + "sbcs r8, r8, #0\n\t" + "sbc lr, lr, #0\n\t" + /* mask m and sub from result if overflow */ + "rsb lr, lr, #0\n\t" + "subs r1, r1, lr\n\t" + "sbcs r2, r2, lr\n\t" + "sbcs r3, r3, lr\n\t" + "sbcs r4, r4, #0\n\t" + "sbcs r5, r5, #0\n\t" + "sbcs r6, r6, #0\n\t" + "sbcs r7, r7, lr, LSR #31\n\t" + "sbc r8, r8, lr\n\t" + "ldr %[r], [sp, #64]\n\t" + "stm %[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "add sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + (void)m_p; + (void)mp_p; +} + +#else +/* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montgomery form. + * m Modulus (prime). + * mp Montgomery multiplier. 
+ */ +static SP_NOINLINE void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "sub sp, sp, #0x44\n\t" + "str %[r], [sp, #64]\n\t" + "ldm %[a], {%[r], %[a], r2, r3, r4, r5, r6, r7}\n\t" + "umull r9, r10, %[r], %[r]\n\t" + "umull r11, r12, %[r], %[a]\n\t" + "adds r11, r11, r11\n\t" + "mov lr, #0\n\t" + "umaal r10, r11, lr, lr\n\t" + "stm sp, {r9, r10}\n\t" + "mov r8, lr\n\t" + "umaal r8, r12, %[r], r2\n\t" + "adcs r8, r8, r8\n\t" + "umaal r8, r11, %[a], %[a]\n\t" + "umull r9, r10, %[r], r3\n\t" + "umaal r9, r12, %[a], r2\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, lr, lr\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "str r8, [sp, #8]\n\t" + "str r9, [sp, #12]\n\t" +#else + "strd r8, r9, [sp, #8]\n\t" +#endif + "mov r9, lr\n\t" + "umaal r9, r10, %[r], r4\n\t" + "umaal r9, r12, %[a], r3\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, r2, r2\n\t" + "str r9, [sp, #16]\n\t" + "umull r9, r8, %[r], r5\n\t" + "umaal r9, r12, %[a], r4\n\t" + "umaal r9, r10, r2, r3\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, lr, lr\n\t" + "str r9, [sp, #20]\n\t" + "mov r9, lr\n\t" + "umaal r9, r8, %[r], r6\n\t" + "umaal r9, r12, %[a], r5\n\t" + "umaal r9, r10, r2, r4\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r11, r3, r3\n\t" + "str r9, [sp, #24]\n\t" + "umull %[r], r9, %[r], r7\n\t" + "umaal %[r], r8, %[a], r6\n\t" + "umaal %[r], r12, r2, r5\n\t" + "umaal %[r], r10, r3, r4\n\t" + "adcs %[r], %[r], %[r]\n\t" + "umaal %[r], r11, lr, lr\n\t" + /* R[7] = r0 */ + "umaal r9, r8, %[a], r7\n\t" + "umaal r9, r10, r2, r6\n\t" + "umaal r12, r9, r3, r5\n\t" + "adcs r12, r12, r12\n\t" + "umaal r12, r11, r4, r4\n\t" + /* R[8] = r12 */ + "umaal r9, r8, r2, r7\n\t" + "umaal r10, r9, r3, r6\n\t" + "mov r2, lr\n\t" + "umaal r10, r2, r4, r5\n\t" + "adcs r10, r10, r10\n\t" + "umaal r11, r10, lr, lr\n\t" + /* R[9] = r11 */ + "umaal r2, r8, r3, r7\n\t" + "umaal r2, r9, r4, r6\n\t" + "adcs r3, r2, r2\n\t" + "umaal r10, r3, r5, r5\n\t" + /* R[10] = r10 */ + "mov %[a], lr\n\t" + "umaal %[a], r8, r4, r7\n\t" + "umaal %[a], r9, r5, r6\n\t" + "adcs r4, %[a], %[a]\n\t" + "umaal r3, r4, lr, lr\n\t" + /* R[11] = r3 */ + "umaal r8, r9, r5, r7\n\t" + "adcs r8, r8, r8\n\t" + "umaal r4, r8, r6, r6\n\t" + /* R[12] = r4 */ + "mov r5, lr\n\t" + "umaal r5, r9, r6, r7\n\t" + "adcs r5, r5, r5\n\t" + "umaal r8, r5, lr, lr\n\t" + /* R[13] = r8 */ + "adcs r9, r9, r9\n\t" + "umaal r9, r5, r7, r7\n\t" + "adcs r7, r5, lr\n\t" + /* R[14] = r9 */ + /* R[15] = r7 */ + "mov lr, sp\n\t" + "add lr, lr, #28\n\t" + "stm lr!, {%[r], r12}\n\t" + "stm lr!, {r11}\n\t" + "stm lr!, {r10}\n\t" + "stm lr!, {r3, r4, r8, r9}\n\t" + "stm lr!, {r7}\n\t" + /* Start Reduction */ + "ldm sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "str %[r], [sp]\n\t" + "mov r3, r11\n\t" + "mov r4, r12\n\t" + /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ + /* - a[0] << 224 */ + /* + (a[0]-a[1] * 2) << (6 * 32) */ + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" + "adds r11, r11, r5\n\t" + "adc r12, r12, r6\n\t" + /* - a[0] << (7 * 32) */ + "sub r12, r12, r5\n\t" + /* + a[0]-a[4] << (3 * 32) */ + "mov r0, r8\n\t" + "mov r1, r9\n\t" + "mov r2, r10\n\t" + "adds r8, r8, r5\n\t" + "adcs r9, r9, r6\n\t" + "adcs r10, r10, r7\n\t" + "adcs r11, r11, r0\n\t" + "adc r12, r12, r1\n\t" + /* a += mu * m */ + /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ + /* a[0] 
= = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "adds r0, r0, r5\n\t" + "adcs r1, r1, r6\n\t" + "adcs r2, r2, r7\n\t" + "adcs r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r3, r3, r5\n\t" + "adcs r4, r4, r6\n\t" + "adc lr, lr, #0\n\t" + "str r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "add r0, sp, #32\n\t" + "ldm r0, {r2, r3, r4}\n\t" + "adds r2, r2, lr\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r5\n\t" + "adcs r3, r3, r6\n\t" + "adcs r4, r4, r7\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r7\n\t" + "adcs r3, r3, r8\n\t" + "adcs r4, r4, r9\n\t" + "adc lr, lr, #0\n\t" + "adds r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "stm r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "ldm r0, {r0, r1, r2, r3, r4}\n\t" + "adds r0, r0, lr\n\t" + "adcs r1, r1, #0\n\t" + "adcs r2, r2, #0\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "mov lr, #0\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r8\n\t" + "adcs r1, r1, r9\n\t" + "adcs r2, r2, r10\n\t" + "adcs r3, r3, r11\n\t" + "adcs r4, r4, r12\n\t" + "adc lr, lr, #0\n\t" + "adds r0, r0, r10\n\t" + "adcs r1, r1, r11\n\t" + "adcs r2, r2, r12\n\t" + "adcs r3, r3, #0\n\t" + "adcs r4, r4, #0\n\t" + "adc lr, lr, #0\n\t" + "str r0, [sp, #44]\n\t" + "str r1, [sp, #48]\n\t" + "str r2, [sp, #52]\n\t" + "str r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "add r0, sp, #28\n\t" + "ldm r0, {r0, r1, r2, r3}\n\t" + "subs r0, r0, r5\n\t" + "sbcs r1, r1, r6\n\t" + "sbcs r2, r2, r7\n\t" + "sbcs r3, r3, r8\n\t" + "add r0, sp, #44\n\t" + "mov r8, r4\n\t" + "ldm r0, {r4, r5, r6, r7}\n\t" + "sbcs r4, r4, r9\n\t" + "sbcs r5, r5, r10\n\t" + "sbcs r6, r6, r11\n\t" + "sbcs r7, r7, r12\n\t" + "sbcs r8, r8, #0\n\t" + "sbc lr, lr, #0\n\t" + /* mask m and sub from result if overflow */ + "rsb lr, lr, #0\n\t" + "subs r1, r1, lr\n\t" + "sbcs r2, r2, lr\n\t" + "sbcs r3, r3, lr\n\t" + "sbcs r4, r4, #0\n\t" + "sbcs r5, r5, #0\n\t" + "sbcs r6, r6, #0\n\t" + "sbcs r7, r7, lr, LSR #31\n\t" + "sbc r8, r8, lr\n\t" + "ldr %[r], [sp, #64]\n\t" + "stm %[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "add sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + (void)m_p; + (void)mp_p; +} + +#endif #if !defined(WOLFSSL_SP_SMALL) || defined(HAVE_COMP_KEY) /* Square the Montgomery form number a number of times. 
(r = a ^ n mod m) * @@ -69503,8 +70443,8 @@ static void sp_256_mont_inv_8(sp_digit* r, const sp_digit* a, sp_digit* td) */ static sp_int32 sp_256_cmp_8(const sp_digit* a_p, const sp_digit* b_p) { - register const sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r2, #-1\n\t" @@ -69645,10 +70585,10 @@ static sp_int32 sp_256_cmp_8(const sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r6, #0\n\t" @@ -69685,10 +70625,10 @@ static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_d */ static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -69732,6 +70672,7 @@ static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_d #ifndef WOLFSSL_SP_SMALL #define sp_256_mont_reduce_order_8 sp_256_mont_reduce_8 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Reduce the number back to 256 bits using Montgomery reduction. * * a A single precision number to reduce in place. 
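The sp_256_mont_reduce_8 copies added in the hunks below (the WOLFSSL_ARM_ARCH < 4, < 6 and UMAAL variants), like sp_256_mont_reduce_order_8 later on, run the same word-wise Montgomery reduction loop and differ only in how the 32x32 products are formed. A rough portable sketch of that loop, with illustrative names (mont_reduce_sketch is not a function from this patch) and uint32_t standing in for sp_digit:

    #include <stdint.h>

    /* For each of the 8 low words, choose mu so that word cancels, add mu*m,
     * and push the carry into the upper half.  The assembly keeps the window
     * in registers and finishes with a conditional subtract of the modulus
     * (sp_256_cond_sub_8). */
    static void mont_reduce_sketch(uint32_t a[16], const uint32_t m[8], uint32_t mp)
    {
        uint32_t carry = 0;
        int i, j;

        for (i = 0; i < 8; i++) {
            uint32_t mu = a[i] * mp;          /* mp == -(1/m) mod 2^32 */
            uint64_t c = 0;

            for (j = 0; j < 8; j++) {         /* a[i..i+7] += mu * m[0..7] */
                c += (uint64_t)mu * m[j] + a[i + j];
                a[i + j] = (uint32_t)c;
                c >>= 32;
            }
            c += (uint64_t)a[i + 8] + carry;  /* fold the carry into a[i+8] */
            a[i + 8] = (uint32_t)c;
            carry = (uint32_t)(c >> 32);
        }
        /* the reduced value now sits in a[8..15] plus the final carry */
    }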
@@ -69740,12 +70681,12 @@ static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_d */ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* m asm ("r1") = m_p; - register sp_digit mp asm ("r2") = mp_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "ldr r11, [%[m]]\n\t" #endif /* i = 0 */ @@ -69758,10 +70699,9 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, /* mu = a[i] * mp */ "mul r8, %[mp], r12\n\t" /* a[i+0] += m[0] * mu */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r11, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r7\n\t" @@ -69785,14 +70725,8 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, "lsl r6, r6, #16\n\t" "adds r12, r12, r6\n\t" "adc r5, r5, r7\n\t" -#else - "umull r6, r7, r8, r11\n\t" - "adds r12, r12, r6\n\t" - "adc r5, r7, #0\n\t" -#endif /* a[i+1] += m[1] * mu */ "ldr r7, [%[m], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r10\n\t" @@ -69816,18 +70750,12 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r4, r4, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r4, r10, #0\n\t" -#endif "mov r12, lr\n\t" "adds r12, r12, r5\n\t" "adc r4, r4, #0\n\t" /* a[i+2] += m[2] * mu */ "ldr r7, [%[m], #8]\n\t" "ldr lr, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r10\n\t" @@ -69851,17 +70779,11 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r5, r5, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r5, r10, #0\n\t" -#endif "adds lr, lr, r4\n\t" "adc r5, r5, #0\n\t" /* a[i+3] += m[3] * mu */ "ldr r7, [%[m], #12]\n\t" "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -69885,18 +70807,12 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #12]\n\t" "adc r4, r4, #0\n\t" /* a[i+4] += m[4] * mu */ "ldr r7, [%[m], #16]\n\t" "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -69920,18 +70836,12 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #16]\n\t" "adc r5, r5, #0\n\t" /* a[i+5] += 
m[5] * mu */ "ldr r7, [%[m], #20]\n\t" "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -69955,18 +70865,12 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #20]\n\t" "adc r4, r4, #0\n\t" /* a[i+6] += m[6] * mu */ "ldr r7, [%[m], #24]\n\t" "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -69990,22 +70894,16 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #24]\n\t" "adc r5, r5, #0\n\t" /* a[i+7] += m[7] * mu */ -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) - "ldr r7, [%[m], #28]\n\t" -#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m], #28]\n\t" +#else + "ldr r7, [%[m], #28]\n\t" #endif "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r11, #16\n\t" "lsr r6, r6, #16\n\t" @@ -70036,13 +70934,6 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, "adds r5, r5, r6\n\t" "adcs r4, r4, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r7, r3\n\t" - "mov r3, #0\n\t" - "adc r3, r3, r3\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #28]\n\t" "ldr r10, [%[a], #32]\n\t" @@ -70054,6 +70945,110 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, "add %[a], %[a], #4\n\t" "cmp r9, #32\n\t" "blt L_sp_256_mont_reduce_8_word_%=\n\t" + /* Loop Done */ + "str r12, [%[a]]\n\t" + "str lr, [%[a], #4]\n\t" + "mov %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp); +} + +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + "ldr r11, [%[m]]\n\t" + /* i = 0 */ + "mov r9, #0\n\t" + "mov r3, #0\n\t" + "ldr r12, [%[a]]\n\t" + "ldr lr, [%[a], #4]\n\t" + "\n" + "L_sp_256_mont_reduce_8_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r8, %[mp], r12\n\t" + /* a[i+0] += m[0] * mu */ + "mov r5, #0\n\t" + "umlal r12, r5, r8, r11\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r7, [%[m], #4]\n\t" + "mov r4, #0\n\t" + "umlal lr, r4, r8, r7\n\t" + "mov r12, lr\n\t" + "adds r12, r12, r5\n\t" + "adc r4, r4, #0\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r7, [%[m], #8]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r5, #0\n\t" + "umlal lr, r5, r8, r7\n\t" + "adds lr, lr, r4\n\t" + "adc r5, r5, #0\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r7, [%[m], #12]\n\t" + "ldr r10, [%[a], #12]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #12]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r7, [%[m], #16]\n\t" + "ldr r10, [%[a], #16]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #16]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r7, [%[m], #20]\n\t" + "ldr r10, [%[a], #20]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #20]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r7, [%[m], #24]\n\t" + "ldr r10, [%[a], #24]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #24]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r7, [%[m], #28]\n\t" + "ldr r10, [%[a], #28]\n\t" + "umull r6, r7, r8, r7\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r7, r3\n\t" + "mov r3, #0\n\t" + "adc r3, r3, r3\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #28]\n\t" + "ldr r10, [%[a], #32]\n\t" + "adcs r10, r10, r4\n\t" + "str r10, [%[a], #32]\n\t" + "adc r3, r3, #0\n\t" + /* i += 1 */ + "add r9, r9, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r9, #32\n\t" + "blt L_sp_256_mont_reduce_8_word_%=\n\t" + /* Loop Done */ "str r12, [%[a]]\n\t" "str lr, [%[a], #4]\n\t" "mov %[mp], r3\n\t" @@ -70073,11 +71068,97 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, */ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "\n" + "L_sp_256_mont_reduce_8_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r11, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "ldr r10, [%[m]]\n\t" + "mov r3, #0\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r10, [%[m], #4]\n\t" + "mov r4, r5\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r10, [%[m], #8]\n\t" + "mov r5, r6\n\t" + "umaal r5, r3, r11, r10\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r10, [%[m], #12]\n\t" + "mov r6, r7\n\t" + "umaal r6, r3, r11, r10\n\t" + /* a[i+4] += m[4] * mu 
*/ + "ldr r10, [%[m], #16]\n\t" + "mov r7, r8\n\t" + "umaal r7, r3, r11, r10\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r10, [%[m], #20]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umaal r8, r3, r11, r10\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r10, [%[m], #24]\n\t" + "ldr r9, [%[a], #24]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r10, [%[m], #28]\n\t" + "ldr r9, [%[a], #28]\n\t" + "umaal r9, r3, r11, r10\n\t" + "ldr r11, [%[a], #32]\n\t" + "mov r10, #0\n\t" + "umaal r3, r11, r10, r10\n\t" + "str r9, [%[a], #28]\n\t" + "adds r3, r3, lr\n\t" + "adc lr, r11, #0\n\t" + "str r3, [%[a], #32]\n\t" + /* i += 1 */ + "add r12, r12, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r12, #32\n\t" + "blt L_sp_256_mont_reduce_8_word_%=\n\t" + /* Loop Done */ + "str r4, [%[a]]\n\t" + "str r5, [%[a], #4]\n\t" + "str r6, [%[a], #8]\n\t" + "str r7, [%[a], #12]\n\t" + "str r8, [%[a], #16]\n\t" + "mov %[mp], lr\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp); +} + +#endif +#else +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; __asm__ __volatile__ ( "mov r1, #0\n\t" - /* # i = 0 */ + /* i = 0 */ "mov r8, #0\n\t" "\n" "L_sp_256_mont_reduce_8_word_%=: \n\t" @@ -70170,6 +71251,7 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, (void)mp_p; } +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Reduce the number back to 256 bits using Montgomery reduction. * * a A single precision number to reduce in place. 
@@ -70178,12 +71260,12 @@ static SP_NOINLINE void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, */ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* m asm ("r1") = m_p; - register sp_digit mp asm ("r2") = mp_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "ldr r11, [%[m]]\n\t" #endif /* i = 0 */ @@ -70196,10 +71278,9 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit /* mu = a[i] * mp */ "mul r8, %[mp], r12\n\t" /* a[i+0] += m[0] * mu */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r11, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r7\n\t" @@ -70223,14 +71304,8 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit "lsl r6, r6, #16\n\t" "adds r12, r12, r6\n\t" "adc r5, r5, r7\n\t" -#else - "umull r6, r7, r8, r11\n\t" - "adds r12, r12, r6\n\t" - "adc r5, r7, #0\n\t" -#endif /* a[i+1] += m[1] * mu */ "ldr r7, [%[m], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r10\n\t" @@ -70254,18 +71329,12 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r4, r4, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r4, r10, #0\n\t" -#endif "mov r12, lr\n\t" "adds r12, r12, r5\n\t" "adc r4, r4, #0\n\t" /* a[i+2] += m[2] * mu */ "ldr r7, [%[m], #8]\n\t" "ldr lr, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r10\n\t" @@ -70289,17 +71358,11 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r5, r5, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r5, r10, #0\n\t" -#endif "adds lr, lr, r4\n\t" "adc r5, r5, #0\n\t" /* a[i+3] += m[3] * mu */ "ldr r7, [%[m], #12]\n\t" "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -70323,18 +71386,12 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #12]\n\t" "adc r4, r4, #0\n\t" /* a[i+4] += m[4] * mu */ "ldr r7, [%[m], #16]\n\t" "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -70358,18 +71415,12 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #16]\n\t" "adc r5, r5, #0\n\t" /* 
a[i+5] += m[5] * mu */ "ldr r7, [%[m], #20]\n\t" "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -70393,18 +71444,12 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #20]\n\t" "adc r4, r4, #0\n\t" /* a[i+6] += m[6] * mu */ "ldr r7, [%[m], #24]\n\t" "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -70428,22 +71473,16 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #24]\n\t" "adc r5, r5, #0\n\t" /* a[i+7] += m[7] * mu */ -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) - "ldr r7, [%[m], #28]\n\t" -#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m], #28]\n\t" +#else + "ldr r7, [%[m], #28]\n\t" #endif "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r11, #16\n\t" "lsr r6, r6, #16\n\t" @@ -70474,13 +71513,6 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit "adds r5, r5, r6\n\t" "adcs r4, r4, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r7, r3\n\t" - "mov r3, #0\n\t" - "adc r3, r3, r3\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #28]\n\t" "ldr r10, [%[a], #32]\n\t" @@ -70492,6 +71524,7 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit "add %[a], %[a], #4\n\t" "cmp r9, #32\n\t" "blt L_sp_256_mont_reduce_order_8_word_%=\n\t" + /* Loop Done */ "str r12, [%[a]]\n\t" "str lr, [%[a], #4]\n\t" "mov %[mp], r3\n\t" @@ -70502,6 +71535,195 @@ static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + "ldr r11, [%[m]]\n\t" + /* i = 0 */ + "mov r9, #0\n\t" + "mov r3, #0\n\t" + "ldr r12, [%[a]]\n\t" + "ldr lr, [%[a], #4]\n\t" + "\n" + "L_sp_256_mont_reduce_order_8_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r8, %[mp], r12\n\t" + /* a[i+0] += m[0] * mu */ + "mov r5, #0\n\t" + "umlal r12, r5, r8, r11\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r7, [%[m], #4]\n\t" + "mov r4, #0\n\t" + "umlal lr, r4, r8, r7\n\t" + "mov r12, lr\n\t" + "adds r12, r12, r5\n\t" + "adc r4, r4, #0\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r7, [%[m], #8]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r5, #0\n\t" + "umlal lr, r5, r8, r7\n\t" + "adds lr, lr, r4\n\t" + "adc r5, r5, #0\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r7, [%[m], #12]\n\t" + "ldr r10, [%[a], #12]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #12]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r7, [%[m], #16]\n\t" + "ldr r10, [%[a], #16]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #16]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r7, [%[m], #20]\n\t" + "ldr r10, [%[a], #20]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #20]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r7, [%[m], #24]\n\t" + "ldr r10, [%[a], #24]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #24]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r7, [%[m], #28]\n\t" + "ldr r10, [%[a], #28]\n\t" + "umull r6, r7, r8, r7\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r7, r3\n\t" + "mov r3, #0\n\t" + "adc r3, r3, r3\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #28]\n\t" + "ldr r10, [%[a], #32]\n\t" + "adcs r10, r10, r4\n\t" + "str r10, [%[a], #32]\n\t" + "adc r3, r3, #0\n\t" + /* i += 1 */ + "add r9, r9, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r9, #32\n\t" + "blt L_sp_256_mont_reduce_order_8_word_%=\n\t" + /* Loop Done */ + "str r12, [%[a]]\n\t" + "str lr, [%[a], #4]\n\t" + "mov %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp); +} + +#else +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static SP_NOINLINE void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "\n" + "L_sp_256_mont_reduce_order_8_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r11, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "ldr r10, [%[m]]\n\t" + "mov r3, #0\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r10, [%[m], #4]\n\t" + "mov r4, r5\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r10, [%[m], #8]\n\t" + "mov r5, r6\n\t" + "umaal r5, r3, r11, r10\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r10, [%[m], #12]\n\t" + "mov r6, r7\n\t" + "umaal r6, r3, r11, r10\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r10, [%[m], #16]\n\t" + "mov r7, r8\n\t" + "umaal r7, r3, r11, r10\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r10, [%[m], #20]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umaal r8, r3, r11, r10\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r10, [%[m], #24]\n\t" + "ldr r9, [%[a], #24]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r10, [%[m], #28]\n\t" + "ldr r9, [%[a], #28]\n\t" + "umaal r9, r3, r11, r10\n\t" + "ldr r11, [%[a], #32]\n\t" + "mov r10, #0\n\t" + "umaal r3, r11, r10, r10\n\t" + "str r9, [%[a], #28]\n\t" + "adds r3, r3, lr\n\t" + "adc lr, r11, #0\n\t" + "str r3, [%[a], #32]\n\t" + /* i += 1 */ + "add r12, r12, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r12, #32\n\t" + "blt L_sp_256_mont_reduce_order_8_word_%=\n\t" + /* Loop Done */ + "str r4, [%[a]]\n\t" + "str r5, [%[a], #4]\n\t" + "str r6, [%[a], #8]\n\t" + "str r7, [%[a], #12]\n\t" + "str r8, [%[a], #16]\n\t" + "mov %[mp], lr\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp); +} + +#endif #endif /* WOLFSSL_SP_SMALL */ /* Map the Montgomery form projective coordinate point to an affine point. 
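A reading aid for the sp_256_mont_reduce_8 / sp_256_mont_reduce_order_8 variants above (plain mul/adds when WOLFSSL_ARM_ARCH < 4, umlal when < 6, umaal otherwise): they all implement the same word-wise Montgomery reduction, sketched in portable C below. The helper name mont_reduce_ref_8 and the uint64_t accumulator are illustrative assumptions for this sketch, not wolfSSL code; the final conditional subtraction mirrors the sp_256_cond_sub_8(a - 8, a, m, ...) call that each assembly variant makes.

#include <stdint.h>

/* Sketch only: word-wise Montgomery reduction of a 16-word value a[0..15]
 * modulo the 8-word modulus m, where mp == -(m^-1) mod 2^32.  The assembly
 * variants above compute the same thing with mul/umlal/umaal. */
static void mont_reduce_ref_8(uint32_t* a, const uint32_t* m, uint32_t mp)
{
    uint32_t over = 0;
    for (int i = 0; i < 8; i++) {
        /* mu is chosen so that a[i] + mu * m[0] == 0 (mod 2^32). */
        uint32_t mu = a[i] * mp;
        uint64_t c = 0;
        for (int j = 0; j < 8; j++) {
            c += (uint64_t)mu * m[j] + a[i + j];
            a[i + j] = (uint32_t)c;
            c >>= 32;
        }
        /* Fold the row carry into the next word; track overflow out of it. */
        c += (uint64_t)a[i + 8] + over;
        a[i + 8] = (uint32_t)c;
        over = (uint32_t)(c >> 32);
    }
    /* The reduced value now sits in a[8..15]; conditionally subtract m once
     * and write the result to a[0..7], as the cond_sub call does in the asm. */
    uint32_t mask = (uint32_t)0 - over;
    uint64_t borrow = 0;
    for (int i = 0; i < 8; i++) {
        uint64_t d = (uint64_t)a[i + 8] - (m[i] & mask) - borrow;
        a[i] = (uint32_t)d;
        borrow = (d >> 32) & 1;
    }
}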
* @@ -70552,41 +71774,49 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p, */ static void sp_256_mont_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldm %[a]!, {r8, r9, r10, r11}\n\t" - "ldm %[b]!, {r4, r5, r6, r7}\n\t" - "adds r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r11, r11, r7\n\t" - "stm %[r], {r8, r9, r10, r11}\n\t" - "ldm %[a]!, {r8, r9, r10, r11}\n\t" - "ldm %[b]!, {r4, r5, r6, r7}\n\t" + "mov lr, #0\n\t" + "ldm %[a], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "ldm %[b]!, {r3, r4}\n\t" + "adds r5, r5, r3\n\t" + "adcs r6, r6, r4\n\t" + "ldm %[b]!, {r3, r4}\n\t" + "adcs r7, r7, r3\n\t" "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r11, r11, r7\n\t" - "adc r3, r12, #0\n\t" - "sub r3, r12, r3\n\t" - "and r12, r3, #1\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "subs r4, r4, r3\n\t" - "sbcs r5, r5, r3\n\t" - "sbcs r6, r6, r3\n\t" - "sbcs r7, r7, #0\n\t" + "ldm %[b]!, {r3, r4}\n\t" + "adcs r9, r9, r3\n\t" + "adcs r10, r10, r4\n\t" + "ldm %[b]!, {r3, r4}\n\t" + "adcs r11, r11, r3\n\t" + "adcs r12, r12, r4\n\t" + "adc lr, lr, #0\n\t" + "rsb lr, lr, #0\n\t" + "subs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" "sbcs r8, r8, #0\n\t" "sbcs r9, r9, #0\n\t" - "sbcs r10, r10, r12\n\t" - "sbc r11, r11, r3\n\t" - "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "sbcs r10, r10, #0\n\t" + "sbcs r11, r11, lr, LSR #31\n\t" + "sbcs r12, r12, lr\n\t" + "sbc %[b], %[b], %[b]\n\t" + "sub lr, lr, %[b]\n\t" + "subs r5, r5, lr\n\t" + "sbcs r6, r6, lr\n\t" + "sbcs r7, r7, lr\n\t" + "sbcs r8, r8, #0\n\t" + "sbcs r9, r9, #0\n\t" + "sbcs r10, r10, #0\n\t" + "sbcs r11, r11, lr, LSR #31\n\t" + "sbc r12, r12, lr\n\t" + "stm %[r], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); (void)m_p; } @@ -70599,11 +71829,11 @@ static void sp_256_mont_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static void sp_256_mont_dbl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" + "mov r2, #0\n\t" "ldm %[a], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" "adds r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" @@ -70613,21 +71843,30 @@ static void sp_256_mont_dbl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit "adcs r9, r9, r9\n\t" "adcs r10, r10, r10\n\t" "adcs r11, r11, r11\n\t" - "adc r2, r3, #0\n\t" - "sub r2, r3, r2\n\t" - "and r3, r2, #1\n\t" + "adc r2, r2, #0\n\t" + "rsb r2, r2, #0\n\t" "subs r4, r4, r2\n\t" "sbcs r5, r5, r2\n\t" "sbcs r6, r6, r2\n\t" "sbcs r7, r7, #0\n\t" "sbcs r8, r8, #0\n\t" "sbcs r9, r9, #0\n\t" - "sbcs r10, r10, r3\n\t" + "sbcs r10, r10, r2, LSR #31\n\t" + "sbcs r11, r11, r2\n\t" + "sbc %[a], %[a], %[a]\n\t" + "sub r2, r2, %[a]\n\t" + "subs r4, r4, r2\n\t" + "sbcs 
r5, r5, r2\n\t" + "sbcs r6, r6, r2\n\t" + "sbcs r7, r7, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbcs r9, r9, #0\n\t" + "sbcs r10, r10, r2, LSR #31\n\t" "sbc r11, r11, r2\n\t" "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2", "r3" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2" ); (void)m_p; } @@ -70640,11 +71879,11 @@ static void sp_256_mont_dbl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" + "mov r12, #0\n\t" "ldm %[a], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" "adds r4, r4, r4\n\t" "adcs r5, r5, r5\n\t" @@ -70654,59 +71893,54 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit "adcs r9, r9, r9\n\t" "adcs r10, r10, r10\n\t" "adcs r11, r11, r11\n\t" - "adc r2, r3, #0\n\t" - "sub r2, r3, r2\n\t" - "and r3, r2, #1\n\t" - "subs r4, r4, r2\n\t" - "sbcs r5, r5, r2\n\t" - "sbcs r6, r6, r2\n\t" + "adc r12, r12, #0\n\t" + "rsb r12, r12, #0\n\t" + "subs r4, r4, r12\n\t" + "sbcs r5, r5, r12\n\t" + "sbcs r6, r6, r12\n\t" "sbcs r7, r7, #0\n\t" "sbcs r8, r8, #0\n\t" "sbcs r9, r9, #0\n\t" - "sbcs r10, r10, r3\n\t" - "sbc r11, r11, r2\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r8, [%[r], #16]\n\t" - "str r9, [%[r], #20]\n\t" -#else - "strd r8, r9, [%[r], #16]\n\t" -#endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) - "str r10, [%[r], #24]\n\t" - "str r11, [%[r], #28]\n\t" -#else - "strd r10, r11, [%[r], #24]\n\t" -#endif - "mov r3, #0\n\t" - "ldm %[a]!, {r8, r9, r10, r11}\n\t" - "adds r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r11, r11, r7\n\t" - "stm %[r]!, {r8, r9, r10, r11}\n\t" - "ldm %[a]!, {r8, r9, r10, r11}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r11, r11, r7\n\t" - "sub %[r], %[r], #16\n\t" - "adc r2, r3, #0\n\t" - "sub r2, r3, r2\n\t" - "and r3, r2, #1\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "subs r4, r4, r2\n\t" - "sbcs r5, r5, r2\n\t" - "sbcs r6, r6, r2\n\t" + "sbcs r10, r10, r12, LSR #31\n\t" + "sbcs r11, r11, r12\n\t" + "rsb r12, r12, #0\n\t" + "sbc r12, r12, #0\n\t" + "ldm %[a]!, {r2, r3}\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "ldm %[a]!, {r2, r3}\n\t" + "adcs r6, r6, r2\n\t" + "adcs r7, r7, r3\n\t" + "ldm %[a]!, {r2, r3}\n\t" + "adcs r8, r8, r2\n\t" + "adcs r9, r9, r3\n\t" + "ldm %[a]!, {r2, r3}\n\t" + "adcs r10, r10, r2\n\t" + "adcs r11, r11, r3\n\t" + "adc r12, r12, #0\n\t" + "rsb r12, r12, #0\n\t" + "subs r4, r4, r12\n\t" + "sbcs r5, r5, r12\n\t" + "sbcs r6, r6, r12\n\t" "sbcs r7, r7, #0\n\t" "sbcs r8, r8, #0\n\t" "sbcs r9, r9, #0\n\t" - "sbcs r10, r10, r3\n\t" - "sbc r11, r11, r2\n\t" + "sbcs r10, r10, r12, LSR #31\n\t" + "sbcs r11, r11, r12\n\t" + "sbc r2, r2, r2\n\t" + "sub r12, r12, r2\n\t" + "subs r4, r4, r12\n\t" + "sbcs r5, r5, r12\n\t" + "sbcs r6, r6, r12\n\t" + "sbcs r7, r7, #0\n\t" + "sbcs r8, r8, #0\n\t" + "sbcs r9, r9, #0\n\t" + "sbcs r10, r10, r12, LSR #31\n\t" + "sbc r11, r11, r12\n\t" "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r4", "r5", "r6", "r7", "r8", 
"r9", "r10", "r11", "r2", "r3" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2", "r3", "r12" ); (void)m_p; } @@ -70720,46 +71954,51 @@ static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static void sp_256_mont_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldm %[a]!, {r8, r9, r10, r11}\n\t" - "ldm %[b]!, {r4, r5, r6, r7}\n\t" - "subs r8, r8, r4\n\t" - "sbcs r9, r9, r5\n\t" - "sbcs r10, r10, r6\n\t" - "sbcs r11, r11, r7\n\t" - "stm %[r]!, {r8, r9, r10, r11}\n\t" - "ldm %[a]!, {r8, r9, r10, r11}\n\t" - "ldm %[b]!, {r4, r5, r6, r7}\n\t" + "mov lr, #0\n\t" + "ldm %[a], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "ldm %[b]!, {r3, r4}\n\t" + "subs r5, r5, r3\n\t" + "sbcs r6, r6, r4\n\t" + "ldm %[b]!, {r3, r4}\n\t" + "sbcs r7, r7, r3\n\t" "sbcs r8, r8, r4\n\t" - "sbcs r9, r9, r5\n\t" - "sbcs r10, r10, r6\n\t" - "sbcs r11, r11, r7\n\t" - "sbc r3, r12, #0\n\t" - "sub %[r], %[r], #16\n\t" - "and r12, r3, #1\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "adds r4, r4, r3\n\t" - "adcs r5, r5, r3\n\t" - "adcs r6, r6, r3\n\t" - "adcs r7, r7, #0\n\t" + "ldm %[b]!, {r3, r4}\n\t" + "sbcs r9, r9, r3\n\t" + "sbcs r10, r10, r4\n\t" + "ldm %[b]!, {r3, r4}\n\t" + "sbcs r11, r11, r3\n\t" + "sbcs r12, r12, r4\n\t" + "sbc lr, lr, #0\n\t" + "adds r5, r5, lr\n\t" + "adcs r6, r6, lr\n\t" + "adcs r7, r7, lr\n\t" "adcs r8, r8, #0\n\t" "adcs r9, r9, #0\n\t" - "adcs r10, r10, r12\n\t" - "adc r11, r11, r3\n\t" - "stm %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, lr, LSR #31\n\t" + "adcs r12, r12, lr\n\t" + "adc lr, lr, #0\n\t" + "adds r5, r5, lr\n\t" + "adcs r6, r6, lr\n\t" + "adcs r7, r7, lr\n\t" + "adcs r8, r8, #0\n\t" + "adcs r9, r9, #0\n\t" + "adcs r10, r10, #0\n\t" + "adcs r11, r11, lr, LSR #31\n\t" + "adc r12, r12, lr\n\t" + "stm %[r], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); (void)m_p; } -#define sp_256_mont_sub_lower_8 sp_256_mont_sub_8 /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * * r Result of division by 2. 
@@ -70768,38 +72007,38 @@ static void sp_256_mont_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* m asm ("r2") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" "ldm %[a], {r4, r5, r6, r7}\n\t" "and r3, r4, #1\n\t" - "sub r8, r10, r3\n\t" + "rsb r8, r3, #0\n\t" "and r9, r8, #1\n\t" "adds r4, r4, r8\n\t" "adcs r5, r5, r8\n\t" "adcs r6, r6, r8\n\t" - "adcs r7, r7, r10\n\t" + "adcs r7, r7, #0\n\t" "stm %[r], {r4, r5, r6, r7}\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[a], #16]\n\t" "ldr r5, [%[a], #20]\n\t" #else "ldrd r4, r5, [%[a], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[a], #24]\n\t" "ldr r7, [%[a], #28]\n\t" #else "ldrd r6, r7, [%[a], #24]\n\t" #endif - "adcs r4, r4, r10\n\t" - "adcs r5, r5, r10\n\t" + "adcs r4, r4, #0\n\t" + "adcs r5, r5, #0\n\t" "adcs r6, r6, r9\n\t" "adcs r7, r7, r8\n\t" - "adc r3, r10, r10\n\t" + "mov r3, #0\n\t" + "adc r3, r3, #0\n\t" "lsr r8, r4, #1\n\t" "lsr r9, r5, #1\n\t" "lsr r10, r6, #1\n\t" @@ -70809,13 +72048,13 @@ static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_ "orr r10, r10, r7, lsl #31\n\t" "orr r11, r11, r3, lsl #31\n\t" "mov r3, r4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[r], #16]\n\t" "str r9, [%[r], #20]\n\t" #else "strd r8, r9, [%[r], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [%[r], #24]\n\t" "str r11, [%[r], #28]\n\t" #else @@ -70891,7 +72130,7 @@ static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, /* X = X - Y */ sp_256_mont_sub_8(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_lower_8(y, y, x, p256_mod); + sp_256_mont_sub_8(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -71013,7 +72252,7 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -71074,12 +72313,12 @@ static int sp_256_iszero_8(const sp_digit* a) static void sp_256_proj_point_add_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*8; - sp_digit* t3 = t + 4*8; - sp_digit* t4 = t + 6*8; - sp_digit* t5 = t + 8*8; - sp_digit* t6 = t + 10*8; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*8; + sp_digit* t2 = t + 4*8; + sp_digit* t3 = t + 6*8; + sp_digit* t4 = t + 8*8; + sp_digit* t5 = t + 10*8; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod); @@ -71101,17 +72340,9 @@ static void sp_256_proj_point_add_8(sp_point_256* r, sp_256_proj_point_dbl_8(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - 
(q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_256_mont_sub_8(t2, t2, t1, p256_mod); @@ -71130,20 +72361,31 @@ static void sp_256_proj_point_add_8(sp_point_256* r, sp_256_mont_dbl_8(t3, y, p256_mod); sp_256_mont_sub_8(x, x, t3, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_8(y, y, x, p256_mod); + sp_256_mont_sub_8(y, y, x, p256_mod); sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t5, p256_mod); - for (i = 0; i < 8; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 8; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 8; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -71189,12 +72431,12 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*8; - ctx->t3 = t + 4*8; - ctx->t4 = t + 6*8; - ctx->t5 = t + 8*8; - ctx->t6 = t + 10*8; + ctx->t6 = t; + ctx->t1 = t + 2*8; + ctx->t2 = t + 4*8; + ctx->t3 = t + 6*8; + ctx->t4 = t + 8*8; + ctx->t5 = t + 10*8; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -71301,7 +72543,7 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 22; break; case 22: @@ -71314,22 +72556,28 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 8; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 8; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 8; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -71601,8 +72849,6 @@ static int 
sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons } #ifdef FP_ECC -#define sp_256_mont_dbl_lower_8 sp_256_mont_dbl_8 -#define sp_256_mont_tpl_lower_8 sp_256_mont_tpl_8 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -71641,7 +72887,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_8(a, t1, p256_mod); + sp_256_mont_tpl_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -71650,8 +72896,8 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_8(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_8(b, t2, p256_mod); + sp_256_mont_sub_8(t2, b, x, p256_mod); + sp_256_mont_dbl_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -71671,7 +72917,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_8(a, t1, p256_mod); + sp_256_mont_tpl_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -71680,8 +72926,8 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_8(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_8(b, t2, p256_mod); + sp_256_mont_sub_8(t2, b, x, p256_mod); + sp_256_mont_dbl_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -71737,12 +72983,12 @@ typedef struct sp_table_entry_256 { static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*8; - sp_digit* t3 = t + 4*8; - sp_digit* t4 = t + 6*8; - sp_digit* t5 = t + 8*8; - sp_digit* t6 = t + 10*8; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*8; + sp_digit* t6 = t + 4*8; + sp_digit* t1 = t + 6*8; + sp_digit* t4 = t + 8*8; + sp_digit* t5 = t + 10*8; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -71758,13 +73004,9 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, sp_256_proj_point_dbl_8(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_256_mont_sub_8(t2, t2, p->x, p256_mod); @@ -71773,33 +73015,40 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, /* Z3 = H*Z1 */ sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_256_mont_sqr_8(t1, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t3, p->x, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_8(x, t1, t5, p256_mod); - sp_256_mont_dbl_8(t1, t3, p256_mod); - sp_256_mont_sub_8(x, x, t1, p256_mod); + sp_256_mont_sqr_8(t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t3, p->x, t1, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(t2, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_8(t2, t2, t1, p256_mod); + sp_256_mont_dbl_8(t5, t3, p256_mod); + sp_256_mont_sub_8(x, t2, t5, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_lower_8(t3, t3, x, p256_mod); + sp_256_mont_sub_8(t3, t3, x, p256_mod); sp_256_mont_mul_8(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t5, t5, p->y, p256_mod, p256_mp_mod); - sp_256_mont_sub_8(y, t3, t5, p256_mod); + sp_256_mont_mul_8(t1, t1, p->y, p256_mod, p256_mp_mod); + sp_256_mont_sub_8(y, t3, t1, p256_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 8; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 8; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 8; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -72715,7 +73964,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_256* point = NULL; + sp_point_256* point = NULL; sp_digit* k = NULL; #else sp_point_256 point[2]; @@ -74275,7 +75524,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -74336,7 +75585,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, */ static void sp_256_add_one_8(sp_digit* a_p) { - register sp_digit* a asm ("r0") = a_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; __asm__ __volatile__ ( 
"ldm %[a], {r1, r2, r3, r4}\n\t" @@ -74448,7 +75697,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_256* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -74456,7 +75705,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -74734,16 +75983,15 @@ int sp_ecc_secret_gen_256_nb(sp_ecc_ctx_t* sp_ctx, const mp_int* priv, */ static sp_digit sp_256_sub_in_place_8(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" "mov r12, #0\n\t" "add lr, %[a], #32\n\t" "\n" "L_sp_256_sub_in_pkace_8_word_%=: \n\t" - "subs r12, r10, r12\n\t" + "rsbs r12, r12, #0\n\t" "ldm %[a], {r2, r3, r4, r5}\n\t" "ldm %[b]!, {r6, r7, r8, r9}\n\t" "sbcs r2, r2, r6\n\t" @@ -74751,13 +75999,13 @@ static sp_digit sp_256_sub_in_place_8(sp_digit* a_p, const sp_digit* b_p) "sbcs r4, r4, r8\n\t" "sbcs r5, r5, r9\n\t" "stm %[a]!, {r2, r3, r4, r5}\n\t" - "sbc r12, r10, r10\n\t" + "sbc r12, r12, r12\n\t" "cmp %[a], lr\n\t" "bne L_sp_256_sub_in_pkace_8_word_%=\n\t" "mov %[a], r12\n\t" : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -74770,8 +76018,8 @@ static sp_digit sp_256_sub_in_place_8(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_256_sub_in_place_8(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -74806,15 +76054,14 @@ static sp_digit sp_256_sub_in_place_8(sp_digit* a_p, const sp_digit* b_p) */ static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ "ldr r8, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r5, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -74847,7 +76094,7 @@ static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "L_sp_256_mul_d_8_word_%=: \n\t" /* A[i] * B */ "ldr r8, [%[a], r9]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -74892,7 +76139,7 @@ static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "str r3, [%[r], #32]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", 
"r4", "r5", "r6", "r7", "r8", "r9", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -74905,15 +76152,14 @@ static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r3, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -74938,251 +76184,11 @@ static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) #else "umull r3, r4, %[b], r8\n\t" #endif + "stm %[r]!, {r3}\n\t" "mov r5, #0\n\t" - "str r3, [%[r]], #4\n\t" /* A[1] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[2] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[3] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, 
r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[4] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[5] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[6] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[7] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -75208,15 
+76214,205 @@ static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[2] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[3] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[4] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" #endif - "str r4, [%[r]], #4\n\t" + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[5] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[6] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" 
+ "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[7] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" "str r5, [%[r]]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" ); } @@ -75233,9 +76429,9 @@ static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr r6, %[div], #16\n\t" @@ -75292,9 +76488,9 @@ static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr lr, %[div], #1\n\t" @@ -75324,7 +76520,7 @@ static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "bpl L_div_256_word_8_bit_%=\n\t" "add r3, r3, r3\n\t" "add r3, r3, #1\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -75352,7 +76548,7 @@ static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -75380,7 +76576,7 @@ static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, 
#16\n\t" @@ -75638,7 +76834,7 @@ static void sp_256_mont_inv_order_8(sp_digit* r, const sp_digit* a, sp_256_mont_sqr_n_order_8(t2, t3, 4); /* t = a^ff = t2 * t3 */ sp_256_mont_mul_order_8(t, t2, t3); - /* t3= a^ff00 = t ^ 2 ^ 8 */ + /* t2= a^ff00 = t ^ 2 ^ 8 */ sp_256_mont_sqr_n_order_8(t2, t, 8); /* t = a^ffff = t2 * t */ sp_256_mont_mul_order_8(t, t2, t); @@ -75655,7 +76851,11 @@ static void sp_256_mont_inv_order_8(sp_digit* r, const sp_digit* a, /* t2= a^ffffffff00000000ffffffffffffffff = t2 * t */ sp_256_mont_mul_order_8(t2, t2, t); /* t2= a^ffffffff00000000ffffffffffffffffbce6 */ - for (i=127; i>=112; i--) { + sp_256_mont_sqr_order_8(t2, t2); + sp_256_mont_mul_order_8(t2, t2, a); + sp_256_mont_sqr_n_order_8(t2, t2, 5); + sp_256_mont_mul_order_8(t2, t2, t3); + for (i=121; i>=112; i--) { sp_256_mont_sqr_order_8(t2, t2); if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) { sp_256_mont_mul_order_8(t2, t2, a); @@ -76062,19 +77262,18 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W #ifndef WOLFSSL_SP_SMALL static void sp_256_rshift1_8(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "mov r11, #0\n\t" "mov r12, #0\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r2, [%[a], #16]\n\t" "ldr r3, [%[a], #20]\n\t" #else "ldrd r2, r3, [%[a], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[a], #24]\n\t" "ldr r5, [%[a], #28]\n\t" #else @@ -76089,25 +77288,25 @@ static void sp_256_rshift1_8(sp_digit* r_p, const sp_digit* a_p) "orr r8, r8, r5, lsl #31\n\t" "orr r9, r9, r12, lsl #31\n\t" "mov r12, r2\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[r], #16]\n\t" "str r7, [%[r], #20]\n\t" #else "strd r6, r7, [%[r], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[r], #24]\n\t" "str r9, [%[r], #28]\n\t" #else "strd r8, r9, [%[r], #24]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r2, [%[a]]\n\t" "ldr r3, [%[a], #4]\n\t" #else "ldrd r2, r3, [%[a]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[a], #8]\n\t" "ldr r5, [%[a], #12]\n\t" #else @@ -76121,13 +77320,13 @@ static void sp_256_rshift1_8(sp_digit* r_p, const sp_digit* a_p) "orr r7, r7, r4, lsl #31\n\t" "orr r8, r8, r5, lsl #31\n\t" "orr r9, r9, r12, lsl #31\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r6, [%[r]]\n\t" "str r7, [%[r], #4]\n\t" #else "strd r6, r7, [%[r]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[r], #8]\n\t" "str r9, [%[r], #12]\n\t" #else @@ -76135,7 +77334,7 @@ static void sp_256_rshift1_8(sp_digit* r_p, const sp_digit* a_p) #endif : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10", 
"r11" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" ); } @@ -76147,13 +77346,13 @@ static void sp_256_rshift1_8(sp_digit* r_p, const sp_digit* a_p) */ static void sp_256_div2_mod_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* m asm ("r2") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; __asm__ __volatile__ ( "mov r12, #0\n\t" - "ldr r4, [%[a]], #4\n\t" + "ldm %[a]!, {r4}\n\t" "ands r3, r4, #1\n\t" "beq L_sp_256_div2_mod_8_even_%=\n\t" "ldm %[a]!, {r5, r6, r7}\n\t" @@ -76173,13 +77372,13 @@ static void sp_256_div2_mod_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit "b L_sp_256_div2_mod_8_div2_%=\n\t" "\n" "L_sp_256_div2_mod_8_even_%=: \n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[a], #12]\n\t" "ldr r5, [%[a], #16]\n\t" #else "ldrd r4, r5, [%[a], #12]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[a], #20]\n\t" "ldr r7, [%[a], #24]\n\t" #else @@ -76197,13 +77396,13 @@ static void sp_256_div2_mod_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit "orr r10, r10, r7, lsl #31\n\t" "orr r11, r11, r3, lsl #31\n\t" "mov r3, r4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r8, [%[r], #16]\n\t" "str r9, [%[r], #20]\n\t" #else "strd r8, r9, [%[r], #16]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r10, [%[r], #24]\n\t" "str r11, [%[r], #28]\n\t" #else @@ -76225,7 +77424,7 @@ static void sp_256_div2_mod_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit ); } -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) static const unsigned char L_sp_256_num_bits_8_table[] = { 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, @@ -76263,7 +77462,8 @@ static const unsigned char L_sp_256_num_bits_8_table[] = { static int sp_256_num_bits_8(const sp_digit* a_p) { - register const sp_digit* a asm ("r0") = a_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register unsigned char* L_sp_256_num_bits_8_table_c asm ("r1") = (unsigned char*)&L_sp_256_num_bits_8_table; __asm__ __volatile__ ( "mov lr, %[L_sp_256_num_bits_8_table]\n\t" @@ -76575,9 +77775,9 @@ static int sp_256_num_bits_8(const sp_digit* a_p) "\n" "L_sp_256_num_bits_8_9_%=: \n\t" "mov %[a], r12\n\t" - : [a] "+r" (a) - : [L_sp_256_num_bits_8_table] "r" (L_sp_256_num_bits_8_table) - : "memory", "r1", "r2", "r3", "r12", "lr" + : [a] "+r" (a), [L_sp_256_num_bits_8_table] "+r" (L_sp_256_num_bits_8_table_c) + : + : "memory", "r2", "r3", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -76585,13 +77785,13 @@ static int sp_256_num_bits_8(const sp_digit* a_p) #else static int sp_256_num_bits_8(const sp_digit* a_p) { - register const sp_digit* a asm ("r0") = a_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; __asm__ __volatile__ ( "ldr r1, [%[a], #28]\n\t" "cmp r1, #0\n\t" "beq L_sp_256_num_bits_8_7_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x0\n\t" @@ -76671,7 +77871,7 @@ static int sp_256_num_bits_8(const sp_digit* a_p) return (uint32_t)(size_t)a; } -#endif /* WOLFSSL_SP_ARM_ARCH && (WOLFSSL_SP_ARM_ARCH < 7) */ +#endif /* WOLFSSL_ARM_ARCH && (WOLFSSL_ARM_ARCH < 7) */ /* Non-constant time modular inversion. * * @param [out] r Resulting number. @@ -77764,9 +78964,9 @@ static const sp_digit p384_b[12] = { */ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x60\n\t" @@ -77784,7 +78984,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "L_sp_384_mul_12_inner_%=: \n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[b], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -77856,17 +79056,16 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ */ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #48\n\t" - "mov r10, #0\n\t" /* A[0] * B[0] */ "ldr r11, [%[a]]\n\t" "ldr r12, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r3, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -77896,7 +79095,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [sp]\n\t" /* A[0] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -77935,7 +79134,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[1] * B[0] */ "ldr r8, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -77973,7 +79172,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [sp, #4]\n\t" /* A[2] * B[0] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78013,7 +79212,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[1] */ "ldr r11, [%[a], #4]\n\t" "ldr r12, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" 
"lsr r6, r6, #16\n\t" @@ -78051,7 +79250,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[2] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78089,7 +79288,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [sp, #8]\n\t" /* A[0] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78128,7 +79327,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[1] * B[2] */ "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78165,7 +79364,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[2] * B[1] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78203,7 +79402,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[0] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78241,7 +79440,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [sp, #12]\n\t" /* A[4] * B[0] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78280,7 +79479,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[3] * B[1] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78318,7 +79517,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[2] */ "ldr r11, [%[a], #8]\n\t" "ldr r12, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78356,7 +79555,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[3] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78394,7 +79593,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[4] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, 
#16\n\t" @@ -78432,7 +79631,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [sp, #16]\n\t" /* A[0] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78472,7 +79671,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[4] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78509,7 +79708,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[2] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78546,7 +79745,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[3] * B[2] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78584,7 +79783,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[1] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78622,7 +79821,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[0] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78660,7 +79859,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [sp, #20]\n\t" /* A[6] * B[0] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78700,7 +79899,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[1] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78737,7 +79936,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[4] * B[2] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78775,7 +79974,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[3] */ "ldr r11, [%[a], #12]\n\t" "ldr r12, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, 
#16\n\t" @@ -78813,7 +80012,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[4] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78851,7 +80050,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[5] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78889,7 +80088,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[6] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78927,7 +80126,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [sp, #24]\n\t" /* A[0] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -78967,7 +80166,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[6] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79005,7 +80204,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[5] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79042,7 +80241,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[3] * B[4] */ "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79079,7 +80278,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[4] * B[3] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79117,7 +80316,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[2] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79155,7 +80354,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[1] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr 
r6, r6, #16\n\t" @@ -79193,7 +80392,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[0] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79231,7 +80430,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [sp, #28]\n\t" /* A[8] * B[0] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79271,7 +80470,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[1] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79309,7 +80508,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[2] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79346,7 +80545,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[5] * B[3] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79384,7 +80583,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[4] */ "ldr r11, [%[a], #16]\n\t" "ldr r12, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79422,7 +80621,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[5] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79460,7 +80659,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[6] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79498,7 +80697,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[7] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79536,7 +80735,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[8] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" 
"lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79574,7 +80773,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [sp, #32]\n\t" /* A[0] * B[9] */ "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79614,7 +80813,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[8] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79652,7 +80851,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[7] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79690,7 +80889,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[6] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79727,7 +80926,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[4] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79764,7 +80963,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[5] * B[4] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79802,7 +81001,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[3] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79840,7 +81039,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[2] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79878,7 +81077,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[1] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79916,7 +81115,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[0] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, 
#16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79954,7 +81153,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [sp, #36]\n\t" /* A[10] * B[0] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -79994,7 +81193,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[1] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80032,7 +81231,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[2] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80070,7 +81269,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[3] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80107,7 +81306,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[6] * B[4] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80145,7 +81344,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[5] */ "ldr r11, [%[a], #20]\n\t" "ldr r12, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80183,7 +81382,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[6] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80221,7 +81420,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[7] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80259,7 +81458,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[8] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80297,7 +81496,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[9] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80335,7 +81534,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[10] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80373,7 +81572,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [sp, #40]\n\t" /* A[0] * B[11] */ "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80413,7 +81612,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[10] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80451,7 +81650,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[9] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80489,7 +81688,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[8] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80527,7 +81726,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[7] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80564,7 +81763,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[5] * B[6] */ "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80601,7 +81800,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[6] * B[5] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80639,7 +81838,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[4] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80677,7 +81876,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[3] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80715,7 +81914,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[2] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80753,7 +81952,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[1] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80791,7 +81990,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[0] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80829,7 +82028,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [sp, #44]\n\t" /* A[11] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80869,7 +82068,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[2] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80907,7 +82106,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[3] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80945,7 +82144,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[4] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -80982,7 +82181,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[7] * B[5] */ "ldr r8, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81020,7 +82219,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[6] */ "ldr r11, [%[a], #24]\n\t" "ldr r12, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81058,7 +82257,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[7] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) 
&& (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81096,7 +82295,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[8] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81134,7 +82333,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[9] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81172,7 +82371,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[10] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81210,7 +82409,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[11] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81248,7 +82447,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [%[r], #48]\n\t" /* A[2] * B[11] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81288,7 +82487,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[10] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81326,7 +82525,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[9] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81364,7 +82563,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[8] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81401,7 +82600,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[6] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81438,7 +82637,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[7] * B[6] */ "ldr r8, [%[a], #28]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81476,7 +82675,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[5] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81514,7 +82713,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[4] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81552,7 +82751,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[3] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81590,7 +82789,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[2] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81628,7 +82827,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [%[r], #52]\n\t" /* A[11] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81668,7 +82867,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[4] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81706,7 +82905,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[5] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81743,7 +82942,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[8] * B[6] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81781,7 +82980,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[7] */ "ldr r11, [%[a], #28]\n\t" "ldr r12, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81819,7 +83018,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[8] */ "ldr r8, 
[%[a], #24]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81857,7 +83056,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[9] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81895,7 +83094,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[10] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81933,7 +83132,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[11] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -81971,7 +83170,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [%[r], #56]\n\t" /* A[4] * B[11] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82011,7 +83210,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[10] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82049,7 +83248,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[9] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82086,7 +83285,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[7] * B[8] */ "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82123,7 +83322,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[8] * B[7] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82161,7 +83360,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[6] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82199,7 +83398,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* 
A[10] * B[5] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82237,7 +83436,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[4] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82275,7 +83474,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [%[r], #60]\n\t" /* A[11] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82315,7 +83514,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[6] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82352,7 +83551,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[9] * B[7] */ "ldr r8, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82390,7 +83589,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[8] */ "ldr r11, [%[a], #32]\n\t" "ldr r12, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82428,7 +83627,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[9] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82466,7 +83665,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[10] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82504,7 +83703,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[11] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82542,7 +83741,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [%[r], #64]\n\t" /* A[6] * B[11] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82582,7 +83781,7 @@ static void sp_384_mul_12(sp_digit* 
r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[10] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82619,7 +83818,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[8] * B[9] */ "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82656,7 +83855,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[9] * B[8] */ "ldr r8, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82694,7 +83893,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[7] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82732,7 +83931,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[6] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82770,7 +83969,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [%[r], #68]\n\t" /* A[11] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82809,7 +84008,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[10] * B[8] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82847,7 +84046,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[9] */ "ldr r11, [%[a], #36]\n\t" "ldr r12, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82885,7 +84084,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[10] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82923,7 +84122,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[11] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -82961,7 +84160,7 @@ static void 
sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [%[r], #72]\n\t" /* A[8] * B[11] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -83000,7 +84199,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[9] * B[10] */ "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -83037,7 +84236,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[10] * B[9] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -83075,7 +84274,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[8] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -83112,7 +84311,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif "str r4, [%[r], #76]\n\t" /* A[11] * B[9] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -83152,7 +84351,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[10] */ "ldr r11, [%[a], #40]\n\t" "ldr r12, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -83190,7 +84389,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[11] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -83227,7 +84426,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif "str r5, [%[r], #80]\n\t" /* A[10] * B[11] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -83266,7 +84465,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[11] * B[10] */ "ldr r8, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -83303,7 +84502,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif "str r3, [%[r], #84]\n\t" /* A[11] * B[11] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -83329,9 +84528,7 @@ static void sp_384_mul_12(sp_digit* r_p, const 
sp_digit* a_p, const sp_digit* b_ "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" #else - "umull r6, r7, r8, r9\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r7\n\t" + "umlal r4, r5, r8, r9\n\t" #endif "str r4, [%[r], #88]\n\t" "str r5, [%[r], #92]\n\t" @@ -83343,7 +84540,7 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "stm %[r]!, {r3, r4, r5, r6}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12" ); } @@ -83356,12 +84553,11 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ */ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x60\n\t" - "mov r12, #0\n\t" "mov r6, #0\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" @@ -83370,7 +84566,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "L_sp_384_sqr_12_outer_%=: \n\t" "subs r3, r5, #44\n\t" "it cc\n\t" - "movcc r3, r12\n\t" + "movcc r3, #0\n\t" "sub r4, r5, r3\n\t" "\n" "L_sp_384_sqr_12_inner_%=: \n\t" @@ -83378,7 +84574,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "beq L_sp_384_sqr_12_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -83431,7 +84627,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "\n" "L_sp_384_sqr_12_op_sqr_%=: \n\t" "ldr lr, [%[a], r3]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsr r10, lr, #16\n\t" "lsr r9, r9, #16\n\t" @@ -83485,7 +84681,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "bgt L_sp_384_sqr_12_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -83497,14 +84693,14 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) */ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #48\n\t" /* A[0] * A[0] */ "ldr r10, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsr r9, r10, #16\n\t" "lsl r2, r10, #16\n\t" "lsr r2, r2, #16\n\t" @@ -83523,7 +84719,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[1] */ "ldr r10, [%[a], #4]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -83579,7 +84775,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -83633,7 +84829,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[1] * A[1] */ "ldr r10, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -83663,7 +84859,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -83718,7 +84914,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -83771,7 +84967,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -83826,7 +85022,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -83877,7 +85073,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[2] * A[2] */ "ldr r10, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -83907,7 +85103,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -83937,7 +85133,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -83975,7 +85171,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84020,7 +85216,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84050,7 +85246,7 @@ static void 
sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84088,7 +85284,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84125,7 +85321,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[3] * A[3] */ "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84164,7 +85360,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84194,7 +85390,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84232,7 +85428,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84270,7 +85466,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84315,7 +85511,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84345,7 +85541,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84383,7 +85579,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84421,7 +85617,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #12]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84458,7 +85654,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[4] * A[4] */ "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84497,7 +85693,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84527,7 +85723,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84565,7 +85761,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84603,7 +85799,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84641,7 +85837,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84686,7 +85882,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84716,7 +85912,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84754,7 +85950,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84792,7 +85988,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, 
r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84830,7 +86026,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84867,7 +86063,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[5] * A[5] */ "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84906,7 +86102,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84936,7 +86132,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -84974,7 +86170,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85012,7 +86208,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85050,7 +86246,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85088,7 +86284,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85133,7 +86329,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85163,7 +86359,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85201,7 +86397,7 @@ static void sp_384_sqr_12(sp_digit* 
r_p, const sp_digit* a_p) /* A[3] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85239,7 +86435,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85277,7 +86473,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85314,7 +86510,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[6] * A[6] */ "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85353,7 +86549,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85383,7 +86579,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85421,7 +86617,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85459,7 +86655,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85497,7 +86693,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85542,7 +86738,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85572,7 +86768,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #16]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85610,7 +86806,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85648,7 +86844,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85685,7 +86881,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[7] * A[7] */ "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85724,7 +86920,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85754,7 +86950,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85792,7 +86988,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85830,7 +87026,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85875,7 +87071,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85905,7 +87101,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85943,7 +87139,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH 
< 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -85980,7 +87176,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[8] * A[8] */ "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86019,7 +87215,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86049,7 +87245,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86087,7 +87283,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86132,7 +87328,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86187,7 +87383,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86238,7 +87434,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[9] * A[9] */ "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86268,7 +87464,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86323,7 +87519,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86376,7 +87572,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86430,7 +87626,7 @@ static void 
sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) #endif /* A[10] * A[10] */ "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86460,7 +87656,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86515,7 +87711,7 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "str r2, [%[r], #84]\n\t" /* A[11] * A[11] */ "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -86562,9 +87758,9 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) */ static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -86600,12 +87796,11 @@ static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -86627,10 +87822,11 @@ static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -86645,9 +87841,9 @@ static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r12, #0\n\t" @@ -86682,9 +87878,9 @@ static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm 
("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -86829,14 +88025,14 @@ static void sp_384_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -87025,10 +88221,10 @@ static int sp_384_point_to_ecc_point_12(const sp_point_384* p, ecc_point* pm) */ static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r6, #0\n\t" @@ -87065,10 +88261,10 @@ static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_ */ static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -87125,6 +88321,7 @@ static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_ #endif /* WOLFSSL_SP_SMALL */ #define sp_384_mont_reduce_order_12 sp_384_mont_reduce_12 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Reduce the number back to 384 bits using Montgomery reduction. * * a A single precision number to reduce in place. 
@@ -87133,12 +88330,12 @@ static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_ */ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* m asm ("r1") = m_p; - register sp_digit mp asm ("r2") = mp_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "ldr r11, [%[m]]\n\t" #endif /* i = 0 */ @@ -87151,10 +88348,9 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p /* mu = a[i] * mp */ "mul r8, %[mp], r12\n\t" /* a[i+0] += m[0] * mu */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r11, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r7\n\t" @@ -87178,14 +88374,8 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds r12, r12, r6\n\t" "adc r5, r5, r7\n\t" -#else - "umull r6, r7, r8, r11\n\t" - "adds r12, r12, r6\n\t" - "adc r5, r7, #0\n\t" -#endif /* a[i+1] += m[1] * mu */ "ldr r7, [%[m], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r10\n\t" @@ -87209,18 +88399,12 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r4, r4, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r4, r10, #0\n\t" -#endif "mov r12, lr\n\t" "adds r12, r12, r5\n\t" "adc r4, r4, #0\n\t" /* a[i+2] += m[2] * mu */ "ldr r7, [%[m], #8]\n\t" "ldr lr, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r10\n\t" @@ -87244,17 +88428,11 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r5, r5, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r5, r10, #0\n\t" -#endif "adds lr, lr, r4\n\t" "adc r5, r5, #0\n\t" /* a[i+3] += m[3] * mu */ "ldr r7, [%[m], #12]\n\t" "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -87278,18 +88456,12 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #12]\n\t" "adc r4, r4, #0\n\t" /* a[i+4] += m[4] * mu */ "ldr r7, [%[m], #16]\n\t" "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -87313,18 +88485,12 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #16]\n\t" "adc r5, r5, #0\n\t" /* a[i+5] 
+= m[5] * mu */ "ldr r7, [%[m], #20]\n\t" "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -87348,18 +88514,12 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #20]\n\t" "adc r4, r4, #0\n\t" /* a[i+6] += m[6] * mu */ "ldr r7, [%[m], #24]\n\t" "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -87383,18 +88543,12 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #24]\n\t" "adc r5, r5, #0\n\t" /* a[i+7] += m[7] * mu */ "ldr r7, [%[m], #28]\n\t" "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -87418,18 +88572,12 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #28]\n\t" "adc r4, r4, #0\n\t" /* a[i+8] += m[8] * mu */ "ldr r7, [%[m], #32]\n\t" "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -87453,18 +88601,12 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #32]\n\t" "adc r5, r5, #0\n\t" /* a[i+9] += m[9] * mu */ "ldr r7, [%[m], #36]\n\t" "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -87488,18 +88630,12 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #36]\n\t" "adc r4, r4, #0\n\t" /* a[i+10] += m[10] * mu */ "ldr r7, [%[m], #40]\n\t" "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -87523,22 +88659,16 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #40]\n\t" "adc r5, r5, #0\n\t" /* a[i+11] += m[11] * mu */ -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) - "ldr r7, [%[m], #44]\n\t" -#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m], #44]\n\t" +#else + "ldr r7, [%[m], #44]\n\t" #endif "ldr r10, 
[%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r11, #16\n\t" "lsr r6, r6, #16\n\t" @@ -87569,13 +88699,6 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "adds r5, r5, r6\n\t" "adcs r4, r4, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r7, r3\n\t" - "mov r3, #0\n\t" - "adc r3, r3, r3\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #44]\n\t" "ldr r10, [%[a], #48]\n\t" @@ -87587,6 +88710,7 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p "add %[a], %[a], #4\n\t" "cmp r9, #48\n\t" "blt L_sp_384_mont_reduce_12_word_%=\n\t" + /* Loop Done */ "str r12, [%[a]]\n\t" "str lr, [%[a], #4]\n\t" "mov %[mp], r3\n\t" @@ -87597,6 +88721,247 @@ static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p sp_384_cond_sub_12(a - 12, a, m, (sp_digit)0 - mp); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Reduce the number back to 384 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + "ldr r11, [%[m]]\n\t" + /* i = 0 */ + "mov r9, #0\n\t" + "mov r3, #0\n\t" + "ldr r12, [%[a]]\n\t" + "ldr lr, [%[a], #4]\n\t" + "\n" + "L_sp_384_mont_reduce_12_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r8, %[mp], r12\n\t" + /* a[i+0] += m[0] * mu */ + "mov r5, #0\n\t" + "umlal r12, r5, r8, r11\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r7, [%[m], #4]\n\t" + "mov r4, #0\n\t" + "umlal lr, r4, r8, r7\n\t" + "mov r12, lr\n\t" + "adds r12, r12, r5\n\t" + "adc r4, r4, #0\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r7, [%[m], #8]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r5, #0\n\t" + "umlal lr, r5, r8, r7\n\t" + "adds lr, lr, r4\n\t" + "adc r5, r5, #0\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r7, [%[m], #12]\n\t" + "ldr r10, [%[a], #12]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #12]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r7, [%[m], #16]\n\t" + "ldr r10, [%[a], #16]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #16]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r7, [%[m], #20]\n\t" + "ldr r10, [%[a], #20]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #20]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r7, [%[m], #24]\n\t" + "ldr r10, [%[a], #24]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #24]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r7, [%[m], #28]\n\t" + "ldr r10, [%[a], #28]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #28]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r7, [%[m], #32]\n\t" + "ldr r10, [%[a], #32]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #32]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r7, [%[m], #36]\n\t" 
+ "ldr r10, [%[a], #36]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #36]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r7, [%[m], #40]\n\t" + "ldr r10, [%[a], #40]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #40]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r7, [%[m], #44]\n\t" + "ldr r10, [%[a], #44]\n\t" + "umull r6, r7, r8, r7\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r7, r3\n\t" + "mov r3, #0\n\t" + "adc r3, r3, r3\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #44]\n\t" + "ldr r10, [%[a], #48]\n\t" + "adcs r10, r10, r4\n\t" + "str r10, [%[a], #48]\n\t" + "adc r3, r3, #0\n\t" + /* i += 1 */ + "add r9, r9, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r9, #48\n\t" + "blt L_sp_384_mont_reduce_12_word_%=\n\t" + /* Loop Done */ + "str r12, [%[a]]\n\t" + "str lr, [%[a], #4]\n\t" + "mov %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_384_cond_sub_12(a - 12, a, m, (sp_digit)0 - mp); +} + +#else +/* Reduce the number back to 384 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "\n" + "L_sp_384_mont_reduce_12_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r11, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "ldr r10, [%[m]]\n\t" + "mov r3, #0\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r10, [%[m], #4]\n\t" + "mov r4, r5\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r10, [%[m], #8]\n\t" + "mov r5, r6\n\t" + "umaal r5, r3, r11, r10\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r10, [%[m], #12]\n\t" + "mov r6, r7\n\t" + "umaal r6, r3, r11, r10\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r10, [%[m], #16]\n\t" + "mov r7, r8\n\t" + "umaal r7, r3, r11, r10\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r10, [%[m], #20]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umaal r8, r3, r11, r10\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r10, [%[m], #24]\n\t" + "ldr r9, [%[a], #24]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r10, [%[m], #28]\n\t" + "ldr r9, [%[a], #28]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r10, [%[m], #32]\n\t" + "ldr r9, [%[a], #32]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r10, [%[m], #36]\n\t" + "ldr r9, [%[a], #36]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r10, [%[m], #40]\n\t" + "ldr r9, [%[a], #40]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r10, [%[m], #44]\n\t" + "ldr r9, [%[a], #44]\n\t" + "umaal r9, r3, r11, r10\n\t" + "ldr r11, [%[a], #48]\n\t" + "mov r10, #0\n\t" + "umaal r3, r11, r10, 
r10\n\t" + "str r9, [%[a], #44]\n\t" + "adds r3, r3, lr\n\t" + "adc lr, r11, #0\n\t" + "str r3, [%[a], #48]\n\t" + /* i += 1 */ + "add r12, r12, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r12, #48\n\t" + "blt L_sp_384_mont_reduce_12_word_%=\n\t" + /* Loop Done */ + "str r4, [%[a]]\n\t" + "str r5, [%[a], #4]\n\t" + "str r6, [%[a], #8]\n\t" + "str r7, [%[a], #12]\n\t" + "str r8, [%[a], #16]\n\t" + "mov %[mp], lr\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_384_cond_sub_12(a - 12, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -87748,8 +89113,8 @@ static void sp_384_mont_inv_12(sp_digit* r, const sp_digit* a, sp_digit* td) */ static sp_int32 sp_384_cmp_12(const sp_digit* a_p, const sp_digit* b_p) { - register const sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r2, #-1\n\t" @@ -87972,10 +89337,10 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p, */ static void sp_384_mont_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register const sp_digit* m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register const sp_digit* m asm ("r3") = (const sp_digit*)m_p; sp_digit o; @@ -87991,9 +89356,9 @@ static void sp_384_mont_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static void sp_384_mont_dbl_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* m asm ("r2") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; sp_digit o; @@ -88009,9 +89374,9 @@ static void sp_384_mont_dbl_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static void sp_384_mont_tpl_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* m asm ("r2") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; sp_digit o; @@ -88032,10 +89397,10 @@ static void sp_384_mont_tpl_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -88072,10 +89437,10 @@ static sp_digit 
sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_ */ static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r8, #0\n\t" @@ -88139,10 +89504,10 @@ static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_ */ static void sp_384_mont_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register const sp_digit* m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register const sp_digit* m asm ("r3") = (const sp_digit*)m_p; sp_digit o; @@ -88150,14 +89515,13 @@ static void sp_384_mont_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi sp_384_cond_add_12(r, r, m, o); } -#define sp_384_mont_sub_lower_12 sp_384_mont_sub_12 #ifdef WOLFSSL_SP_SMALL #else #endif /* WOLFSSL_SP_SMALL */ static void sp_384_rshift1_12(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3}\n\t" @@ -88281,7 +89645,7 @@ static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, /* X = X - Y */ sp_384_mont_sub_12(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_lower_12(y, y, x, p384_mod); + sp_384_mont_sub_12(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -88403,7 +89767,7 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co break; case 16: /* Y = Y - X */ - sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -88466,12 +89830,12 @@ static int sp_384_iszero_12(const sp_digit* a) static void sp_384_proj_point_add_12(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*12; - sp_digit* t3 = t + 4*12; - sp_digit* t4 = t + 6*12; - sp_digit* t5 = t + 8*12; - sp_digit* t6 = t + 10*12; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*12; + sp_digit* t2 = t + 4*12; + sp_digit* t3 = t + 6*12; + sp_digit* t4 = t + 8*12; + sp_digit* t5 = t + 10*12; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod); @@ -88493,17 +89857,9 @@ static void sp_384_proj_point_add_12(sp_point_384* r, sp_384_proj_point_dbl_12(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_384_mont_sub_12(t2, t2, t1, p384_mod); @@ -88522,20 +89878,31 @@ static void 
sp_384_proj_point_add_12(sp_point_384* r, sp_384_mont_dbl_12(t3, y, p384_mod); sp_384_mont_sub_12(x, x, t3, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_12(y, y, x, p384_mod); + sp_384_mont_sub_12(y, y, x, p384_mod); sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t5, p384_mod); - for (i = 0; i < 12; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 12; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 12; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -88581,12 +89948,12 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*12; - ctx->t3 = t + 4*12; - ctx->t4 = t + 6*12; - ctx->t5 = t + 8*12; - ctx->t6 = t + 10*12; + ctx->t6 = t; + ctx->t1 = t + 2*12; + ctx->t2 = t + 4*12; + ctx->t3 = t + 6*12; + ctx->t4 = t + 8*12; + ctx->t5 = t + 10*12; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -88693,7 +90060,7 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 22; break; case 22: @@ -88706,22 +90073,28 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 12; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 12; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 12; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -89017,8 +90390,6 @@ static int sp_384_ecc_mulmod_fast_12(sp_point_384* r, const sp_point_384* g, con } #ifdef FP_ECC -#define sp_384_mont_dbl_lower_12 sp_384_mont_dbl_12 -#define sp_384_mont_tpl_lower_12 
sp_384_mont_tpl_12 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -89057,7 +90428,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_12(a, t1, p384_mod); + sp_384_mont_tpl_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -89066,8 +90437,8 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_12(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_12(b, t2, p384_mod); + sp_384_mont_sub_12(t2, b, x, p384_mod); + sp_384_mont_dbl_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -89087,7 +90458,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_12(a, t1, p384_mod); + sp_384_mont_tpl_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -89096,8 +90467,8 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_12(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_12(b, t2, p384_mod); + sp_384_mont_sub_12(t2, b, x, p384_mod); + sp_384_mont_dbl_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -89153,12 +90524,12 @@ typedef struct sp_table_entry_384 { static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*12; - sp_digit* t3 = t + 4*12; - sp_digit* t4 = t + 6*12; - sp_digit* t5 = t + 8*12; - sp_digit* t6 = t + 10*12; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*12; + sp_digit* t6 = t + 4*12; + sp_digit* t1 = t + 6*12; + sp_digit* t4 = t + 8*12; + sp_digit* t5 = t + 10*12; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -89174,13 +90545,9 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, sp_384_proj_point_dbl_12(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_384_mont_sub_12(t2, t2, p->x, p384_mod); @@ -89189,33 +90556,40 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, /* Z3 = H*Z1 */ sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_384_mont_sqr_12(t1, t4, p384_mod, p384_mp_mod); - sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t3, p->x, t5, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_12(x, t1, t5, p384_mod); - sp_384_mont_dbl_12(t1, t3, p384_mod); - sp_384_mont_sub_12(x, x, t1, p384_mod); + sp_384_mont_sqr_12(t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t3, p->x, t1, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t1, t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(t2, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_12(t2, t2, t1, p384_mod); + sp_384_mont_dbl_12(t5, t3, p384_mod); + sp_384_mont_sub_12(x, t2, t5, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_lower_12(t3, t3, x, p384_mod); + sp_384_mont_sub_12(t3, t3, x, p384_mod); sp_384_mont_mul_12(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t5, t5, p->y, p384_mod, p384_mp_mod); - sp_384_mont_sub_12(y, t3, t5, p384_mod); + sp_384_mont_mul_12(t1, t1, p->y, p384_mod, p384_mp_mod); + sp_384_mont_sub_12(y, t3, t1, p384_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 12; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 12; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 12; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -90163,7 +91537,7 @@ int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_384* point = NULL; + sp_point_384* point = NULL; sp_digit* k = NULL; #else sp_point_384 point[2]; @@ -91723,7 +93097,7 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -91784,7 +93158,7 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, */ static void sp_384_add_one_12(sp_digit* a_p) { - register sp_digit* a asm ("r0") = a_p; + register sp_digit* a asm ("r0") = 
(sp_digit*)a_p; __asm__ __volatile__ ( "ldm %[a], {r1, r2, r3, r4}\n\t" @@ -91902,7 +93276,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_384* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -91910,7 +93284,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -92188,16 +93562,15 @@ int sp_ecc_secret_gen_384_nb(sp_ecc_ctx_t* sp_ctx, const mp_int* priv, */ static sp_digit sp_384_sub_in_place_12(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" "mov r12, #0\n\t" "add lr, %[a], #48\n\t" "\n" "L_sp_384_sub_in_pkace_12_word_%=: \n\t" - "subs r12, r10, r12\n\t" + "rsbs r12, r12, #0\n\t" "ldm %[a], {r2, r3, r4, r5}\n\t" "ldm %[b]!, {r6, r7, r8, r9}\n\t" "sbcs r2, r2, r6\n\t" @@ -92205,13 +93578,13 @@ static sp_digit sp_384_sub_in_place_12(sp_digit* a_p, const sp_digit* b_p) "sbcs r4, r4, r8\n\t" "sbcs r5, r5, r9\n\t" "stm %[a]!, {r2, r3, r4, r5}\n\t" - "sbc r12, r10, r10\n\t" + "sbc r12, r12, r12\n\t" "cmp %[a], lr\n\t" "bne L_sp_384_sub_in_pkace_12_word_%=\n\t" "mov %[a], r12\n\t" : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -92224,8 +93597,8 @@ static sp_digit sp_384_sub_in_place_12(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_384_sub_in_place_12(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -92267,15 +93640,14 @@ static sp_digit sp_384_sub_in_place_12(sp_digit* a_p, const sp_digit* b_p) */ static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ "ldr r8, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r5, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -92308,7 +93680,7 @@ static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "L_sp_384_mul_d_12_word_%=: \n\t" /* A[i] * B */ "ldr r8, [%[a], r9]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -92353,7 +93725,7 @@ static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "str r3, [%[r], #48]\n\t" : [r] "+r" (r), 
[a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -92366,15 +93738,14 @@ static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r3, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -92399,411 +93770,43 @@ static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) #else "umull r3, r4, %[b], r8\n\t" #endif + "stm %[r]!, {r3}\n\t" "mov r5, #0\n\t" - "str r3, [%[r]], #4\n\t" /* A[1] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" "lsr r7, r7, #16\n\t" "mul r7, r6, r7\n\t" "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, #0\n\t" "lsr r7, r8, #16\n\t" "mul r6, r7, r6\n\t" "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, r7\n\t" "lsr r6, %[b], #16\n\t" "lsr r7, r8, #16\n\t" "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "add r5, r5, r7\n\t" "lsl r7, r8, #16\n\t" "lsr r7, r7, #16\n\t" "mul r6, r7, r6\n\t" "lsr r7, r6, #16\n\t" "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" + "adc r5, r5, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" + "umlal r4, r5, %[b], r8\n\t" #endif - "str r4, [%[r]], #4\n\t" + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" /* A[2] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[3] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" 
- "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[4] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[5] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[6] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[7] * B */ - 
"ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[8] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[9] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[10] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - 
"lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[11] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -92829,15 +93832,301 @@ static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "adds r5, r5, r6\n\t" "adc r3, r3, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[3] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[4] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[5] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" "adds r5, r5, r6\n\t" "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" #endif - "str r5, [%[r]], #4\n\t" + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[6] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, 
r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[7] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[8] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[9] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[10] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[11] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, 
r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" "str r3, [%[r]]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" ); } @@ -92854,9 +94143,9 @@ static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr r6, %[div], #16\n\t" @@ -92913,9 +94202,9 @@ static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr lr, %[div], #1\n\t" @@ -92945,7 +94234,7 @@ static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "bpl L_div_384_word_12_bit_%=\n\t" "add r3, r3, r3\n\t" "add r3, r3, #1\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -92973,7 +94262,7 @@ static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -93001,7 +94290,7 @@ static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -93664,12 +94953,12 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W */ static void sp_384_div2_mod_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* m asm ("r2") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; __asm__ __volatile__ ( - "ldr r4, [%[a]], #4\n\t" + "ldm %[a]!, {r4}\n\t" "ands r3, r4, #1\n\t" "beq L_sp_384_div2_mod_12_even_%=\n\t" "mov r12, #0\n\t" @@ -93706,8 +94995,8 @@ static void sp_384_div2_mod_12(sp_digit* r_p, 
const sp_digit* a_p, const sp_digi "stm %[r]!, {r4, r5, r6, r7}\n\t" "\n" "L_sp_384_div2_mod_12_div2_%=: \n\t" - "sub %[r], #48\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "sub %[r], %[r], #48\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[r]]\n\t" "ldr r9, [%[r], #4]\n\t" #else @@ -93765,7 +95054,7 @@ static void sp_384_div2_mod_12(sp_digit* r_p, const sp_digit* a_p, const sp_digi ); } -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) static const unsigned char L_sp_384_num_bits_12_table[] = { 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, @@ -93803,7 +95092,8 @@ static const unsigned char L_sp_384_num_bits_12_table[] = { static int sp_384_num_bits_12(const sp_digit* a_p) { - register const sp_digit* a asm ("r0") = a_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register unsigned char* L_sp_384_num_bits_12_table_c asm ("r1") = (unsigned char*)&L_sp_384_num_bits_12_table; __asm__ __volatile__ ( "mov lr, %[L_sp_384_num_bits_12_table]\n\t" @@ -93813,7 +95103,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_11_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x78\n\t" @@ -93829,7 +95119,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_11_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x70\n\t" @@ -93845,7 +95135,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_11_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x68\n\t" @@ -93858,7 +95148,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "\n" "L_sp_384_num_bits_12_11_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x60\n\t" @@ -93876,7 +95166,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_10_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x58\n\t" @@ -93892,7 +95182,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_10_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x50\n\t" @@ -93908,7 +95198,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_10_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x48\n\t" @@ -93921,7 +95211,7 @@ static int 
sp_384_num_bits_12(const sp_digit* a_p) "\n" "L_sp_384_num_bits_12_10_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x40\n\t" @@ -93939,7 +95229,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_9_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x38\n\t" @@ -93955,7 +95245,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_9_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x30\n\t" @@ -93971,7 +95261,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_9_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x28\n\t" @@ -93984,7 +95274,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "\n" "L_sp_384_num_bits_12_9_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x20\n\t" @@ -94002,7 +95292,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_8_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x18\n\t" @@ -94018,7 +95308,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_8_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x10\n\t" @@ -94034,7 +95324,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_384_num_bits_12_8_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x8\n\t" @@ -94047,7 +95337,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "\n" "L_sp_384_num_bits_12_8_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x0\n\t" @@ -94367,9 +95657,9 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "\n" "L_sp_384_num_bits_12_13_%=: \n\t" "mov %[a], r12\n\t" - : [a] "+r" (a) - : [L_sp_384_num_bits_12_table] "r" (L_sp_384_num_bits_12_table) - : "memory", "r1", "r2", "r3", "r12", "lr" + : [a] "+r" (a), [L_sp_384_num_bits_12_table] "+r" (L_sp_384_num_bits_12_table_c) + : + : "memory", "r2", "r3", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -94377,13 +95667,13 @@ static int sp_384_num_bits_12(const sp_digit* a_p) #else static int sp_384_num_bits_12(const sp_digit* a_p) { - register const sp_digit* a asm 
("r0") = a_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; __asm__ __volatile__ ( "ldr r1, [%[a], #44]\n\t" "cmp r1, #0\n\t" "beq L_sp_384_num_bits_12_11_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x80\n\t" @@ -94398,7 +95688,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "ldr r1, [%[a], #40]\n\t" "cmp r1, #0\n\t" "beq L_sp_384_num_bits_12_10_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x60\n\t" @@ -94413,7 +95703,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "ldr r1, [%[a], #36]\n\t" "cmp r1, #0\n\t" "beq L_sp_384_num_bits_12_9_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x40\n\t" @@ -94428,7 +95718,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "ldr r1, [%[a], #32]\n\t" "cmp r1, #0\n\t" "beq L_sp_384_num_bits_12_8_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x20\n\t" @@ -94443,7 +95733,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) "ldr r1, [%[a], #28]\n\t" "cmp r1, #0\n\t" "beq L_sp_384_num_bits_12_7_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x0\n\t" @@ -94523,7 +95813,7 @@ static int sp_384_num_bits_12(const sp_digit* a_p) return (uint32_t)(size_t)a; } -#endif /* WOLFSSL_SP_ARM_ARCH && (WOLFSSL_SP_ARM_ARCH < 7) */ +#endif /* WOLFSSL_ARM_ARCH && (WOLFSSL_ARM_ARCH < 7) */ /* Non-constant time modular inversion. * * @param [out] r Resulting number. 
@@ -95662,9 +96952,9 @@ static const sp_digit p521_b[17] = { */ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x88\n\t" @@ -95682,7 +96972,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "L_sp_521_mul_17_inner_%=: \n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[b], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -95757,17 +97047,16 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ */ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x44\n\t" - "mov r10, #0\n\t" /* A[0] * B[0] */ "ldr r11, [%[a]]\n\t" "ldr r12, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r3, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -95797,7 +97086,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [sp]\n\t" /* A[0] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -95836,7 +97125,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[1] * B[0] */ "ldr r8, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -95874,7 +97163,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [sp, #4]\n\t" /* A[2] * B[0] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -95914,7 +97203,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[1] */ "ldr r11, [%[a], #4]\n\t" "ldr r12, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -95952,7 +97241,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[2] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -95990,7 +97279,7 @@ static void sp_521_mul_17(sp_digit* r_p, 
const sp_digit* a_p, const sp_digit* b_ "str r5, [sp, #8]\n\t" /* A[0] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96029,7 +97318,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[1] * B[2] */ "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96066,7 +97355,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[2] * B[1] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96104,7 +97393,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[0] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96142,7 +97431,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [sp, #12]\n\t" /* A[4] * B[0] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96181,7 +97470,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[3] * B[1] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96219,7 +97508,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[2] */ "ldr r11, [%[a], #8]\n\t" "ldr r12, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96257,7 +97546,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[3] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96295,7 +97584,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[4] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96333,7 +97622,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [sp, #16]\n\t" /* A[0] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96373,7 +97662,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* 
a_p, const sp_digit* b_ /* A[1] * B[4] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96410,7 +97699,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[2] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96447,7 +97736,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[3] * B[2] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96485,7 +97774,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[1] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96523,7 +97812,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[0] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96561,7 +97850,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [sp, #20]\n\t" /* A[6] * B[0] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96601,7 +97890,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[1] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96638,7 +97927,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[4] * B[2] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96676,7 +97965,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[3] */ "ldr r11, [%[a], #12]\n\t" "ldr r12, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96714,7 +98003,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[4] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96752,7 +98041,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, 
const sp_digit* b_ /* A[1] * B[5] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96790,7 +98079,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[6] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96828,7 +98117,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [sp, #24]\n\t" /* A[0] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96868,7 +98157,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[6] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96906,7 +98195,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[5] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96943,7 +98232,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[3] * B[4] */ "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -96980,7 +98269,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[4] * B[3] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97018,7 +98307,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[2] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97056,7 +98345,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[1] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97094,7 +98383,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[0] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97132,7 +98421,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* 
a_p, const sp_digit* b_ "str r4, [sp, #28]\n\t" /* A[8] * B[0] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97172,7 +98461,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[1] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97210,7 +98499,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[2] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97247,7 +98536,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[5] * B[3] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97285,7 +98574,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[4] */ "ldr r11, [%[a], #16]\n\t" "ldr r12, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97323,7 +98612,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[5] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97361,7 +98650,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[6] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97399,7 +98688,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[7] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97437,7 +98726,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[8] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97475,7 +98764,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [sp, #32]\n\t" /* A[0] * B[9] */ "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97515,7 +98804,7 @@ static void 
sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[8] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97553,7 +98842,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[7] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97591,7 +98880,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[6] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97628,7 +98917,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[4] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97665,7 +98954,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[5] * B[4] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97703,7 +98992,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[3] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97741,7 +99030,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[2] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97779,7 +99068,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[1] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97817,7 +99106,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[0] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97855,7 +99144,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [sp, #36]\n\t" /* A[10] * B[0] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97895,7 +99184,7 @@ 
static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[1] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97933,7 +99222,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[2] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -97971,7 +99260,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[3] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98008,7 +99297,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[6] * B[4] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98046,7 +99335,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[5] */ "ldr r11, [%[a], #20]\n\t" "ldr r12, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98084,7 +99373,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[6] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98122,7 +99411,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[7] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98160,7 +99449,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[8] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98198,7 +99487,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[9] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98236,7 +99525,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[10] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, 
#16\n\t" @@ -98274,7 +99563,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [sp, #40]\n\t" /* A[0] * B[11] */ "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98314,7 +99603,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[10] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98352,7 +99641,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[9] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98390,7 +99679,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[8] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98428,7 +99717,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[7] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98465,7 +99754,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[5] * B[6] */ "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98502,7 +99791,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[6] * B[5] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98540,7 +99829,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[4] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98578,7 +99867,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[3] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98616,7 +99905,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[2] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, 
#16\n\t" "lsr r6, r6, #16\n\t" @@ -98654,7 +99943,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[1] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98692,7 +99981,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[0] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98730,7 +100019,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [sp, #44]\n\t" /* A[12] * B[0] */ "ldr r8, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98770,7 +100059,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[1] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98808,7 +100097,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[2] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98846,7 +100135,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[3] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98884,7 +100173,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[4] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98921,7 +100210,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[7] * B[5] */ "ldr r8, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98959,7 +100248,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[6] */ "ldr r11, [%[a], #24]\n\t" "ldr r12, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -98997,7 +100286,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[7] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99035,7 +100324,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[8] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99073,7 +100362,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[9] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99111,7 +100400,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[10] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99149,7 +100438,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[11] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99187,7 +100476,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[12] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99225,7 +100514,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [sp, #48]\n\t" /* A[0] * B[13] */ "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99265,7 +100554,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[12] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99303,7 +100592,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[11] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99341,7 +100630,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[10] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99379,7 +100668,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[9] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) 
&& (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99417,7 +100706,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[8] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99454,7 +100743,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[6] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99491,7 +100780,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[7] * B[6] */ "ldr r8, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99529,7 +100818,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[5] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99567,7 +100856,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[4] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99605,7 +100894,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[3] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99643,7 +100932,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[2] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99681,7 +100970,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[1] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99719,7 +101008,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[0] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99757,7 +101046,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [sp, #52]\n\t" /* A[14] * B[0] */ "ldr r8, [%[a], #56]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99797,7 +101086,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[1] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99835,7 +101124,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[2] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99873,7 +101162,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[3] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99911,7 +101200,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[4] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99949,7 +101238,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[5] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -99986,7 +101275,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[8] * B[6] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100024,7 +101313,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[7] */ "ldr r11, [%[a], #28]\n\t" "ldr r12, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100062,7 +101351,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[8] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100100,7 +101389,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[9] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100138,7 +101427,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[10] 
*/ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100176,7 +101465,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[11] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100214,7 +101503,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[12] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100252,7 +101541,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[13] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100290,7 +101579,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[14] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100328,7 +101617,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [sp, #56]\n\t" /* A[0] * B[15] */ "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100368,7 +101657,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[14] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100406,7 +101695,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[13] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100444,7 +101733,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[12] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100482,7 +101771,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[11] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100520,7 +101809,7 @@ static void 
sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[10] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100558,7 +101847,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[9] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100595,7 +101884,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[7] * B[8] */ "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100632,7 +101921,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[8] * B[7] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100670,7 +101959,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[6] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100708,7 +101997,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[5] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100746,7 +102035,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[4] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100784,7 +102073,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[3] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100822,7 +102111,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[2] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100860,7 +102149,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[1] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, 
#16\n\t" @@ -100898,7 +102187,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[0] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100936,7 +102225,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [sp, #60]\n\t" /* A[16] * B[0] */ "ldr r8, [%[a], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -100976,7 +102265,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[1] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101014,7 +102303,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[2] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101052,7 +102341,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[3] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101090,7 +102379,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[4] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101128,7 +102417,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[5] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101166,7 +102455,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[6] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101203,7 +102492,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[9] * B[7] */ "ldr r8, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101241,7 +102530,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[8] */ "ldr r11, [%[a], #32]\n\t" "ldr r12, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 
4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101279,7 +102568,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[9] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101317,7 +102606,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[10] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101355,7 +102644,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[11] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101393,7 +102682,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[12] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101431,7 +102720,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[13] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101469,7 +102758,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[14] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101507,7 +102796,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[1] * B[15] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101545,7 +102834,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[0] * B[16] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101583,7 +102872,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [sp, #64]\n\t" /* A[1] * B[16] */ "ldr r8, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101623,7 +102912,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[15] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101661,7 +102950,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[14] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101699,7 +102988,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[13] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101737,7 +103026,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[12] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101775,7 +103064,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[11] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101813,7 +103102,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[10] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101850,7 +103139,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[8] * B[9] */ "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101887,7 +103176,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[9] * B[8] */ "ldr r8, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101925,7 +103214,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[7] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -101963,7 +103252,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[6] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102001,7 +103290,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[5] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], 
#20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102039,7 +103328,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[4] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102077,7 +103366,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[3] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102115,7 +103404,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[2] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102153,7 +103442,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[16] * B[1] */ "ldr r8, [%[a], #64]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102191,7 +103480,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [%[r], #68]\n\t" /* A[16] * B[2] */ "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102231,7 +103520,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[3] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102269,7 +103558,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[4] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102307,7 +103596,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[5] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102345,7 +103634,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[6] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102383,7 +103672,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* 
a_p, const sp_digit* b_ /* A[11] * B[7] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102420,7 +103709,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[10] * B[8] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102458,7 +103747,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[9] */ "ldr r11, [%[a], #36]\n\t" "ldr r12, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102496,7 +103785,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[10] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102534,7 +103823,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[11] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102572,7 +103861,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[12] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102610,7 +103899,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[13] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102648,7 +103937,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[14] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102686,7 +103975,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[3] * B[15] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102724,7 +104013,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[2] * B[16] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102762,7 
+104051,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [%[r], #72]\n\t" /* A[3] * B[16] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102802,7 +104091,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[15] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102840,7 +104129,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[14] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102878,7 +104167,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[13] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102916,7 +104205,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[12] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102954,7 +104243,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[11] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -102991,7 +104280,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[9] * B[10] */ "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103028,7 +104317,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[10] * B[9] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103066,7 +104355,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[8] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103104,7 +104393,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[7] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl 
r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103142,7 +104431,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[6] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103180,7 +104469,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[5] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103218,7 +104507,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[4] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103256,7 +104545,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[16] * B[3] */ "ldr r8, [%[a], #64]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103294,7 +104583,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [%[r], #76]\n\t" /* A[16] * B[4] */ "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103334,7 +104623,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[5] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103372,7 +104661,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[6] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103410,7 +104699,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[7] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103448,7 +104737,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[8] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103485,7 +104774,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[11] * B[9] */ "ldr r8, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103523,7 +104812,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[10] */ "ldr r11, [%[a], #40]\n\t" "ldr r12, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103561,7 +104850,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[11] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103599,7 +104888,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[12] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103637,7 +104926,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[13] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103675,7 +104964,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[14] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103713,7 +105002,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[5] * B[15] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103751,7 +105040,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[4] * B[16] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103789,7 +105078,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [%[r], #80]\n\t" /* A[5] * B[16] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103829,7 +105118,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[15] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103867,7 +105156,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[14] */ "ldr r8, [%[a], #28]\n\t" 
"ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103905,7 +105194,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[13] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103943,7 +105232,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[12] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -103980,7 +105269,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[10] * B[11] */ "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104017,7 +105306,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[11] * B[10] */ "ldr r8, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104055,7 +105344,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[9] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104093,7 +105382,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[8] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104131,7 +105420,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[7] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104169,7 +105458,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[6] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104207,7 +105496,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[16] * B[5] */ "ldr r8, [%[a], #64]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104245,7 +105534,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const 
sp_digit* b_ "str r3, [%[r], #84]\n\t" /* A[16] * B[6] */ "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104285,7 +105574,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[7] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104323,7 +105612,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[8] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104361,7 +105650,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[9] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104398,7 +105687,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[12] * B[10] */ "ldr r8, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104436,7 +105725,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[11] */ "ldr r11, [%[a], #44]\n\t" "ldr r12, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104474,7 +105763,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[12] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104512,7 +105801,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[13] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104550,7 +105839,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[14] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104588,7 +105877,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[7] * B[15] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104626,7 
+105915,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[6] * B[16] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104664,7 +105953,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [%[r], #88]\n\t" /* A[7] * B[16] */ "ldr r8, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104704,7 +105993,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[15] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104742,7 +106031,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[14] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104780,7 +106069,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[13] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104817,7 +106106,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[11] * B[12] */ "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104854,7 +106143,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[12] * B[11] */ "ldr r8, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104892,7 +106181,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[10] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104930,7 +106219,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[9] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -104968,7 +106257,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[8] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" 
"lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105006,7 +106295,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[16] * B[7] */ "ldr r8, [%[a], #64]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105044,7 +106333,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [%[r], #92]\n\t" /* A[16] * B[8] */ "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105084,7 +106373,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[9] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105122,7 +106411,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[10] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105159,7 +106448,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[13] * B[11] */ "ldr r8, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105197,7 +106486,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[12] */ "ldr r11, [%[a], #48]\n\t" "ldr r12, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105235,7 +106524,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[13] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105273,7 +106562,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[14] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105311,7 +106600,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[9] * B[15] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105349,7 +106638,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[8] * B[16] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) 
+#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105387,7 +106676,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [%[r], #96]\n\t" /* A[9] * B[16] */ "ldr r8, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105427,7 +106716,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[15] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105465,7 +106754,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[14] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105502,7 +106791,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[12] * B[13] */ "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105539,7 +106828,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[13] * B[12] */ "ldr r8, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105577,7 +106866,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[11] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105615,7 +106904,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[10] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105653,7 +106942,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[16] * B[9] */ "ldr r8, [%[a], #64]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105691,7 +106980,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [%[r], #100]\n\t" /* A[16] * B[10] */ "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105731,7 +107020,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[11] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #44]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105768,7 +107057,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[14] * B[12] */ "ldr r8, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105806,7 +107095,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[13] */ "ldr r11, [%[a], #52]\n\t" "ldr r12, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105844,7 +107133,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[14] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105882,7 +107171,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[11] * B[15] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105920,7 +107209,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[10] * B[16] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105958,7 +107247,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r5, [%[r], #104]\n\t" /* A[11] * B[16] */ "ldr r8, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -105998,7 +107287,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[15] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106035,7 +107324,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[13] * B[14] */ "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106072,7 +107361,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[14] * B[13] */ "ldr r8, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106110,7 +107399,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[12] */ "ldr r8, 
[%[a], #60]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106148,7 +107437,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[16] * B[11] */ "ldr r8, [%[a], #64]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106186,7 +107475,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r3, [%[r], #108]\n\t" /* A[16] * B[12] */ "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106225,7 +107514,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[15] * B[13] */ "ldr r8, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106263,7 +107552,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[14] */ "ldr r11, [%[a], #56]\n\t" "ldr r12, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106301,7 +107590,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[13] * B[15] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106339,7 +107628,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[12] * B[16] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106377,7 +107666,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "str r4, [%[r], #112]\n\t" /* A[13] * B[16] */ "ldr r8, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106416,7 +107705,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[14] * B[15] */ "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106453,7 +107742,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[15] * B[14] */ "ldr r8, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106491,7 +107780,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, 
const sp_digit* b_ /* A[16] * B[13] */ "ldr r8, [%[a], #64]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106528,7 +107817,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif "str r5, [%[r], #116]\n\t" /* A[16] * B[14] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106568,7 +107857,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[15] * B[15] */ "ldr r11, [%[a], #60]\n\t" "ldr r12, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106606,7 +107895,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ /* A[14] * B[16] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106643,7 +107932,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif "str r3, [%[r], #120]\n\t" /* A[15] * B[16] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106682,7 +107971,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif /* A[16] * B[15] */ "ldr r8, [%[a], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106719,7 +108008,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ #endif "str r4, [%[r], #124]\n\t" /* A[16] * B[16] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -106745,9 +108034,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r5, r5, r6\n\t" "adc r3, r3, r7\n\t" #else - "umull r6, r7, r8, r9\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r7\n\t" + "umlal r5, r3, r8, r9\n\t" #endif "str r5, [%[r], #128]\n\t" "str r3, [%[r], #132]\n\t" @@ -106763,7 +108050,7 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "stm %[r]!, {r3}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12" ); } @@ -106776,12 +108063,11 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ */ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x88\n\t" - "mov r12, #0\n\t" "mov r6, #0\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" @@ 
-106790,7 +108076,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "L_sp_521_sqr_17_outer_%=: \n\t" "subs r3, r5, #0x40\n\t" "it cc\n\t" - "movcc r3, r12\n\t" + "movcc r3, #0\n\t" "sub r4, r5, r3\n\t" "\n" "L_sp_521_sqr_17_inner_%=: \n\t" @@ -106798,7 +108084,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "beq L_sp_521_sqr_17_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -106851,7 +108137,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "\n" "L_sp_521_sqr_17_op_sqr_%=: \n\t" "ldr lr, [%[a], r3]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsr r10, lr, #16\n\t" "lsr r9, r9, #16\n\t" @@ -106908,7 +108194,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "bgt L_sp_521_sqr_17_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -106920,14 +108206,14 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) */ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x44\n\t" /* A[0] * A[0] */ "ldr r10, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsr r9, r10, #16\n\t" "lsl r2, r10, #16\n\t" "lsr r2, r2, #16\n\t" @@ -106946,7 +108232,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[1] */ "ldr r10, [%[a], #4]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107002,7 +108288,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107056,7 +108342,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[1] * A[1] */ "ldr r10, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107086,7 +108372,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107141,7 +108427,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, 
r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107194,7 +108480,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107249,7 +108535,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107300,7 +108586,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[2] * A[2] */ "ldr r10, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107330,7 +108616,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107360,7 +108646,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107398,7 +108684,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107443,7 +108729,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107473,7 +108759,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107511,7 +108797,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107548,7 +108834,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[3] * A[3] */ "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107587,7 +108873,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const 
sp_digit* a_p) /* A[0] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107617,7 +108903,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107655,7 +108941,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107693,7 +108979,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107738,7 +109024,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107768,7 +109054,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107806,7 +109092,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107844,7 +109130,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107881,7 +109167,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[4] * A[4] */ "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107920,7 +109206,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107950,7 +109236,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #4]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -107988,7 +109274,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108026,7 +109312,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108064,7 +109350,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108109,7 +109395,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108139,7 +109425,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108177,7 +109463,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108215,7 +109501,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108253,7 +109539,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108290,7 +109576,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[5] * A[5] */ "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108329,7 +109615,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108359,7 +109645,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108397,7 +109683,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108435,7 +109721,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108473,7 +109759,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108511,7 +109797,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108556,7 +109842,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108586,7 +109872,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108624,7 +109910,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108662,7 +109948,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108700,7 +109986,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, 
r8, #16\n\t" @@ -108738,7 +110024,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108775,7 +110061,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[6] * A[6] */ "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108814,7 +110100,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108844,7 +110130,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108882,7 +110168,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108920,7 +110206,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108958,7 +110244,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -108996,7 +110282,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109034,7 +110320,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109079,7 +110365,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109109,7 +110395,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) 
/* A[1] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109147,7 +110433,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109185,7 +110471,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109223,7 +110509,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109261,7 +110547,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109299,7 +110585,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109336,7 +110622,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[7] * A[7] */ "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109375,7 +110661,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109405,7 +110691,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109443,7 +110729,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109481,7 +110767,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #12]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109519,7 +110805,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109557,7 +110843,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109595,7 +110881,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109633,7 +110919,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109678,7 +110964,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109708,7 +110994,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109746,7 +111032,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109784,7 +111070,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109822,7 +111108,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109860,7 +111146,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109898,7 +111184,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109936,7 +111222,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -109973,7 +111259,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[8] * A[8] */ "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110012,7 +111298,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110042,7 +111328,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110080,7 +111366,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110118,7 +111404,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110156,7 +111442,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110194,7 +111480,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110232,7 +111518,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, 
#16\n\t" "lsr r8, r8, #16\n\t" @@ -110270,7 +111556,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110315,7 +111601,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110345,7 +111631,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110383,7 +111669,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110421,7 +111707,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110459,7 +111745,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110497,7 +111783,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110535,7 +111821,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110572,7 +111858,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[9] * A[9] */ "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110611,7 +111897,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110641,7 +111927,7 @@ static void 
sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110679,7 +111965,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110717,7 +112003,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110755,7 +112041,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110793,7 +112079,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110831,7 +112117,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110876,7 +112162,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110906,7 +112192,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110944,7 +112230,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -110982,7 +112268,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111020,7 +112306,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[8] * 
A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111058,7 +112344,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111095,7 +112381,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[10] * A[10] */ "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111134,7 +112420,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111164,7 +112450,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111202,7 +112488,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111240,7 +112526,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111278,7 +112564,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111316,7 +112602,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111361,7 +112647,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111391,7 +112677,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #28]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111429,7 +112715,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111467,7 +112753,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111505,7 +112791,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111542,7 +112828,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[11] * A[11] */ "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111581,7 +112867,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111611,7 +112897,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111649,7 +112935,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111687,7 +112973,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111725,7 +113011,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[11] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111770,7 +113056,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111800,7 +113086,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111838,7 +113124,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111876,7 +113162,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[11] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111913,7 +113199,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[12] * A[12] */ "ldr r10, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111952,7 +113238,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -111982,7 +113268,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112020,7 +113306,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[11] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112058,7 +113344,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[12] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112103,7 +113389,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112133,7 +113419,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[11] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" 
"lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112171,7 +113457,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[12] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112208,7 +113494,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[13] * A[13] */ "ldr r10, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112247,7 +113533,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[11] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112277,7 +113563,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[12] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112315,7 +113601,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[13] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112360,7 +113646,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[12] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112415,7 +113701,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[13] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112466,7 +113752,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[14] * A[14] */ "ldr r10, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112496,7 +113782,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[13] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112551,7 +113837,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[14] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112604,7 +113890,7 @@ static void 
sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[14] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112658,7 +113944,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) #endif /* A[15] * A[15] */ "ldr r10, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112688,7 +113974,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) /* A[15] * A[16] */ "ldr r10, [%[a], #64]\n\t" "ldr r12, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112743,7 +114029,7 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "str r3, [%[r], #124]\n\t" /* A[16] * A[16] */ "ldr r10, [%[a], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -112794,9 +114080,9 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) */ static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -112838,12 +114124,11 @@ static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -112876,10 +114161,11 @@ static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit "ldm %[b]!, {r7}\n\t" "adcs r3, r3, r7\n\t" "stm %[r]!, {r3}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -112894,9 +114180,9 @@ static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r12, #0\n\t" @@ -112936,9 
+114222,9 @@ static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit */ static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -113010,14 +114296,14 @@ static void sp_521_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -113206,10 +114492,10 @@ static int sp_521_point_to_ecc_point_17(const sp_point_521* p, ecc_point* pm) */ static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r6, #0\n\t" @@ -113246,10 +114532,10 @@ static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_ */ static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -113331,7 +114617,7 @@ static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_ */ static SP_NOINLINE void sp_521_mont_reduce_17(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x44\n\t" @@ -113402,7 +114688,7 @@ static SP_NOINLINE void sp_521_mont_reduce_17(sp_digit* a_p, const sp_digit* m_p /* 12-16 */ "ldm %[a], {r1, r2, r3, r4, r5}\n\t" "ldm sp!, {r7, r8, r9, r10, r11}\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov lr, #0x1\n\t" "lsl lr, lr, #8\n\t" "add lr, lr, #0xff\n\t" @@ -113451,6 +114737,7 @@ static SP_NOINLINE void sp_521_mont_reduce_17(sp_digit* a_p, const sp_digit* m_p (void)mp_p; } +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Reduce the number back to 521 bits using Montgomery reduction. * * a A single precision number to reduce in place. 
@@ -113459,12 +114746,12 @@ static SP_NOINLINE void sp_521_mont_reduce_17(sp_digit* a_p, const sp_digit* m_p */ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* m asm ("r1") = m_p; - register sp_digit mp asm ("r2") = mp_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "ldr r11, [%[m]]\n\t" #endif /* i = 0 */ @@ -113478,7 +114765,7 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "mul r8, %[mp], r12\n\t" "cmp r9, #0x40\n\t" "bne L_sp_521_mont_reduce_order_17_nomask_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r7, #0x1\n\t" "lsl r7, r7, #8\n\t" "add r7, r7, #0xff\n\t" @@ -113489,10 +114776,9 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "\n" "L_sp_521_mont_reduce_order_17_nomask_%=: \n\t" /* a[i+0] += m[0] * mu */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r11, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r7\n\t" @@ -113516,15 +114802,9 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r12, r12, r6\n\t" "adc r5, r5, r7\n\t" -#else - "umull r6, r7, r8, r11\n\t" - "adds r12, r12, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "str r12, [%[a]]\n\t" /* a[i+1] += m[1] * mu */ "ldr r7, [%[m], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r10\n\t" @@ -113548,18 +114828,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r4, r4, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r4, r10, #0\n\t" -#endif "mov r12, lr\n\t" "adds r12, r12, r5\n\t" "adc r4, r4, #0\n\t" /* a[i+2] += m[2] * mu */ "ldr r7, [%[m], #8]\n\t" "ldr lr, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r10\n\t" @@ -113583,17 +114857,11 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r5, r5, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r5, r10, #0\n\t" -#endif "adds lr, lr, r4\n\t" "adc r5, r5, #0\n\t" /* a[i+3] += m[3] * mu */ "ldr r7, [%[m], #12]\n\t" "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -113617,18 +114885,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #12]\n\t" "adc r4, r4, #0\n\t" /* a[i+4] += m[4] * mu */ "ldr r7, [%[m], #16]\n\t" "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -113652,18 +114914,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #16]\n\t" "adc r5, r5, #0\n\t" /* a[i+5] += m[5] * mu */ "ldr r7, [%[m], #20]\n\t" "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -113687,18 +114943,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #20]\n\t" "adc r4, r4, #0\n\t" /* a[i+6] += m[6] * mu */ "ldr r7, [%[m], #24]\n\t" "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -113722,18 +114972,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #24]\n\t" "adc r5, r5, #0\n\t" /* a[i+7] += m[7] * mu */ "ldr r7, [%[m], #28]\n\t" "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -113757,18 +115001,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #28]\n\t" "adc r4, r4, #0\n\t" /* a[i+8] += m[8] * mu */ "ldr r7, [%[m], #32]\n\t" "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -113792,18 +115030,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #32]\n\t" "adc r5, r5, #0\n\t" /* a[i+9] += m[9] * mu */ "ldr r7, [%[m], #36]\n\t" "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -113827,18 +115059,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #36]\n\t" "adc r4, r4, #0\n\t" /* a[i+10] += m[10] * mu */ "ldr r7, [%[m], #40]\n\t" "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -113862,18 +115088,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, 
r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #40]\n\t" "adc r5, r5, #0\n\t" /* a[i+11] += m[11] * mu */ "ldr r7, [%[m], #44]\n\t" "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -113897,18 +115117,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #44]\n\t" "adc r4, r4, #0\n\t" /* a[i+12] += m[12] * mu */ "ldr r7, [%[m], #48]\n\t" "ldr r10, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -113932,18 +115146,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #48]\n\t" "adc r5, r5, #0\n\t" /* a[i+13] += m[13] * mu */ "ldr r7, [%[m], #52]\n\t" "ldr r10, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -113967,18 +115175,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #52]\n\t" "adc r4, r4, #0\n\t" /* a[i+14] += m[14] * mu */ "ldr r7, [%[m], #56]\n\t" "ldr r10, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -114002,18 +115204,12 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #56]\n\t" "adc r5, r5, #0\n\t" /* a[i+15] += m[15] * mu */ "ldr r7, [%[m], #60]\n\t" "ldr r10, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -114037,22 +115233,16 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #60]\n\t" "adc r4, r4, #0\n\t" /* a[i+16] += m[16] * mu */ -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) - "ldr r7, [%[m], #64]\n\t" -#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m], #64]\n\t" +#else + "ldr r7, [%[m], #64]\n\t" #endif "ldr r10, [%[a], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r11, #16\n\t" "lsr r6, r6, #16\n\t" @@ -114083,13 +115273,6 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "adds r4, r4, r6\n\t" "adcs r5, r5, r7\n\t" "adc r3, r3, #0\n\t" -#else - 
"umull r6, r7, r8, r7\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r7, r3\n\t" - "mov r3, #0\n\t" - "adc r3, r3, r3\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #64]\n\t" "ldr r10, [%[a], #68]\n\t" @@ -114101,6 +115284,7 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi "add %[a], %[a], #4\n\t" "cmp r9, #0x44\n\t" "blt L_sp_521_mont_reduce_order_17_word_%=\n\t" + /* Loop Done */ "str r12, [%[a]]\n\t" "str lr, [%[a], #4]\n\t" "sub %[a], %[a], #4\n\t" @@ -114181,6 +115365,478 @@ static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digi sp_521_cond_sub_17(a - 17, a, m, (sp_digit)0 - mp); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Reduce the number back to 521 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + "ldr r11, [%[m]]\n\t" + /* i = 0 */ + "mov r9, #0\n\t" + "mov r3, #0\n\t" + "ldr r12, [%[a]]\n\t" + "ldr lr, [%[a], #4]\n\t" + "\n" + "L_sp_521_mont_reduce_order_17_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r8, %[mp], r12\n\t" + "cmp r9, #0x40\n\t" + "bne L_sp_521_mont_reduce_order_17_nomask_%=\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r7, #0x1\n\t" + "lsl r7, r7, #8\n\t" + "add r7, r7, #0xff\n\t" +#else + "mov r7, #0x1ff\n\t" +#endif + "and r8, r8, r7\n\t" + "\n" + "L_sp_521_mont_reduce_order_17_nomask_%=: \n\t" + /* a[i+0] += m[0] * mu */ + "mov r5, #0\n\t" + "umlal r12, r5, r8, r11\n\t" + "str r12, [%[a]]\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r7, [%[m], #4]\n\t" + "mov r4, #0\n\t" + "umlal lr, r4, r8, r7\n\t" + "mov r12, lr\n\t" + "adds r12, r12, r5\n\t" + "adc r4, r4, #0\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r7, [%[m], #8]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r5, #0\n\t" + "umlal lr, r5, r8, r7\n\t" + "adds lr, lr, r4\n\t" + "adc r5, r5, #0\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r7, [%[m], #12]\n\t" + "ldr r10, [%[a], #12]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #12]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r7, [%[m], #16]\n\t" + "ldr r10, [%[a], #16]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #16]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r7, [%[m], #20]\n\t" + "ldr r10, [%[a], #20]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #20]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r7, [%[m], #24]\n\t" + "ldr r10, [%[a], #24]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #24]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r7, [%[m], #28]\n\t" + "ldr r10, [%[a], #28]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #28]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r7, [%[m], #32]\n\t" + "ldr r10, [%[a], #32]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #32]\n\t" + "adc r5, r5, #0\n\t" + /* 
a[i+9] += m[9] * mu */ + "ldr r7, [%[m], #36]\n\t" + "ldr r10, [%[a], #36]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #36]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r7, [%[m], #40]\n\t" + "ldr r10, [%[a], #40]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #40]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r7, [%[m], #44]\n\t" + "ldr r10, [%[a], #44]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #44]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r7, [%[m], #48]\n\t" + "ldr r10, [%[a], #48]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #48]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r7, [%[m], #52]\n\t" + "ldr r10, [%[a], #52]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #52]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r7, [%[m], #56]\n\t" + "ldr r10, [%[a], #56]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #56]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r7, [%[m], #60]\n\t" + "ldr r10, [%[a], #60]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #60]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r7, [%[m], #64]\n\t" + "ldr r10, [%[a], #64]\n\t" + "umull r6, r7, r8, r7\n\t" + "adds r4, r4, r6\n\t" + "adcs r5, r7, r3\n\t" + "mov r3, #0\n\t" + "adc r3, r3, r3\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #64]\n\t" + "ldr r10, [%[a], #68]\n\t" + "adcs r10, r10, r5\n\t" + "str r10, [%[a], #68]\n\t" + "adc r3, r3, #0\n\t" + /* i += 1 */ + "add r9, r9, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r9, #0x44\n\t" + "blt L_sp_521_mont_reduce_order_17_word_%=\n\t" + /* Loop Done */ + "str r12, [%[a]]\n\t" + "str lr, [%[a], #4]\n\t" + "sub %[a], %[a], #4\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "lsr r4, r4, #9\n\t" + "orr r4, r4, r5, lsl #23\n\t" + "str r4, [%[a], #4]\n\t" + "ldr r4, [%[a], #8]\n\t" + "lsr r5, r5, #9\n\t" + "orr r5, r5, r4, lsl #23\n\t" + "str r5, [%[a], #8]\n\t" + "ldr r5, [%[a], #12]\n\t" + "lsr r4, r4, #9\n\t" + "orr r4, r4, r5, lsl #23\n\t" + "str r4, [%[a], #12]\n\t" + "ldr r4, [%[a], #16]\n\t" + "lsr r5, r5, #9\n\t" + "orr r5, r5, r4, lsl #23\n\t" + "str r5, [%[a], #16]\n\t" + "ldr r5, [%[a], #20]\n\t" + "lsr r4, r4, #9\n\t" + "orr r4, r4, r5, lsl #23\n\t" + "str r4, [%[a], #20]\n\t" + "ldr r4, [%[a], #24]\n\t" + "lsr r5, r5, #9\n\t" + "orr r5, r5, r4, lsl #23\n\t" + "str r5, [%[a], #24]\n\t" + "ldr r5, [%[a], #28]\n\t" + "lsr r4, r4, #9\n\t" + "orr r4, r4, r5, lsl #23\n\t" + "str r4, [%[a], #28]\n\t" + "ldr r4, [%[a], #32]\n\t" + "lsr r5, r5, #9\n\t" + "orr r5, r5, r4, lsl #23\n\t" + "str r5, [%[a], #32]\n\t" + "ldr r5, [%[a], #36]\n\t" + "lsr r4, r4, #9\n\t" + "orr r4, r4, r5, lsl #23\n\t" + "str r4, [%[a], #36]\n\t" + "ldr r4, [%[a], #40]\n\t" + "lsr r5, r5, #9\n\t" + "orr r5, r5, r4, lsl #23\n\t" + "str r5, [%[a], #40]\n\t" + "ldr r5, [%[a], #44]\n\t" + "lsr r4, r4, #9\n\t" + "orr r4, r4, r5, lsl #23\n\t" + "str r4, [%[a], #44]\n\t" + "ldr r4, [%[a], #48]\n\t" + "lsr r5, r5, #9\n\t" + "orr r5, r5, r4, lsl #23\n\t" + "str r5, [%[a], #48]\n\t" + "ldr r5, [%[a], #52]\n\t" + "lsr r4, r4, #9\n\t" + "orr r4, r4, r5, lsl #23\n\t" + "str r4, [%[a], #52]\n\t" 
+ "ldr r4, [%[a], #56]\n\t" + "lsr r5, r5, #9\n\t" + "orr r5, r5, r4, lsl #23\n\t" + "str r5, [%[a], #56]\n\t" + "ldr r5, [%[a], #60]\n\t" + "lsr r4, r4, #9\n\t" + "orr r4, r4, r5, lsl #23\n\t" + "str r4, [%[a], #60]\n\t" + "ldr r4, [%[a], #64]\n\t" + "lsr r5, r5, #9\n\t" + "orr r5, r5, r4, lsl #23\n\t" + "str r5, [%[a], #64]\n\t" + "lsr r4, r4, #9\n\t" + "str r4, [%[a], #68]\n\t" + "lsr r3, r4, #9\n\t" + "add %[a], %[a], #4\n\t" + "mov %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_521_cond_sub_17(a - 17, a, m, (sp_digit)0 - mp); +} + +#else +/* Reduce the number back to 521 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "\n" + "L_sp_521_mont_reduce_order_17_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r11, %[mp], r4\n\t" + "cmp r12, #0x40\n\t" + "bne L_sp_521_mont_reduce_order_17_nomask_%=\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "mov r10, #0x1\n\t" + "lsl r10, r10, #8\n\t" + "add r10, r10, #0xff\n\t" +#else + "mov r10, #0x1ff\n\t" +#endif + "and r11, r11, r10\n\t" + "\n" + "L_sp_521_mont_reduce_order_17_nomask_%=: \n\t" + /* a[i+0] += m[0] * mu */ + "ldr r10, [%[m]]\n\t" + "mov r3, #0\n\t" + "umaal r4, r3, r11, r10\n\t" + "str r4, [%[a]]\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r10, [%[m], #4]\n\t" + "mov r4, r5\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r10, [%[m], #8]\n\t" + "mov r5, r6\n\t" + "umaal r5, r3, r11, r10\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r10, [%[m], #12]\n\t" + "mov r6, r7\n\t" + "umaal r6, r3, r11, r10\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r10, [%[m], #16]\n\t" + "mov r7, r8\n\t" + "umaal r7, r3, r11, r10\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r10, [%[m], #20]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umaal r8, r3, r11, r10\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r10, [%[m], #24]\n\t" + "ldr r9, [%[a], #24]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r10, [%[m], #28]\n\t" + "ldr r9, [%[a], #28]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r10, [%[m], #32]\n\t" + "ldr r9, [%[a], #32]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r10, [%[m], #36]\n\t" + "ldr r9, [%[a], #36]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r10, [%[m], #40]\n\t" + "ldr r9, [%[a], #40]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r10, [%[m], #44]\n\t" + "ldr r9, [%[a], #44]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r10, [%[m], #48]\n\t" + "ldr r9, [%[a], #48]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "ldr 
r10, [%[m], #52]\n\t" + "ldr r9, [%[a], #52]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r10, [%[m], #56]\n\t" + "ldr r9, [%[a], #56]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r10, [%[m], #60]\n\t" + "ldr r9, [%[a], #60]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r10, [%[m], #64]\n\t" + "ldr r9, [%[a], #64]\n\t" + "umaal r9, r3, r11, r10\n\t" + "ldr r11, [%[a], #68]\n\t" + "mov r10, #0\n\t" + "umaal r3, r11, r10, r10\n\t" + "str r9, [%[a], #64]\n\t" + "adds r3, r3, lr\n\t" + "adc lr, r11, #0\n\t" + "str r3, [%[a], #68]\n\t" + /* i += 1 */ + "add r12, r12, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r12, #0x44\n\t" + "blt L_sp_521_mont_reduce_order_17_word_%=\n\t" + /* Loop Done */ + "str r4, [%[a]]\n\t" + "str r5, [%[a], #4]\n\t" + "str r6, [%[a], #8]\n\t" + "str r7, [%[a], #12]\n\t" + "str r8, [%[a], #16]\n\t" + "sub %[a], %[a], #4\n\t" + "ldr r10, [%[a]]\n\t" + "ldr r3, [%[a], #4]\n\t" + "lsr r10, r10, #9\n\t" + "orr r10, r10, r3, lsl #23\n\t" + "str r10, [%[a], #4]\n\t" + "ldr r10, [%[a], #8]\n\t" + "lsr r3, r3, #9\n\t" + "orr r3, r3, r10, lsl #23\n\t" + "str r3, [%[a], #8]\n\t" + "ldr r3, [%[a], #12]\n\t" + "lsr r10, r10, #9\n\t" + "orr r10, r10, r3, lsl #23\n\t" + "str r10, [%[a], #12]\n\t" + "ldr r10, [%[a], #16]\n\t" + "lsr r3, r3, #9\n\t" + "orr r3, r3, r10, lsl #23\n\t" + "str r3, [%[a], #16]\n\t" + "ldr r3, [%[a], #20]\n\t" + "lsr r10, r10, #9\n\t" + "orr r10, r10, r3, lsl #23\n\t" + "str r10, [%[a], #20]\n\t" + "ldr r10, [%[a], #24]\n\t" + "lsr r3, r3, #9\n\t" + "orr r3, r3, r10, lsl #23\n\t" + "str r3, [%[a], #24]\n\t" + "ldr r3, [%[a], #28]\n\t" + "lsr r10, r10, #9\n\t" + "orr r10, r10, r3, lsl #23\n\t" + "str r10, [%[a], #28]\n\t" + "ldr r10, [%[a], #32]\n\t" + "lsr r3, r3, #9\n\t" + "orr r3, r3, r10, lsl #23\n\t" + "str r3, [%[a], #32]\n\t" + "ldr r3, [%[a], #36]\n\t" + "lsr r10, r10, #9\n\t" + "orr r10, r10, r3, lsl #23\n\t" + "str r10, [%[a], #36]\n\t" + "ldr r10, [%[a], #40]\n\t" + "lsr r3, r3, #9\n\t" + "orr r3, r3, r10, lsl #23\n\t" + "str r3, [%[a], #40]\n\t" + "ldr r3, [%[a], #44]\n\t" + "lsr r10, r10, #9\n\t" + "orr r10, r10, r3, lsl #23\n\t" + "str r10, [%[a], #44]\n\t" + "ldr r10, [%[a], #48]\n\t" + "lsr r3, r3, #9\n\t" + "orr r3, r3, r10, lsl #23\n\t" + "str r3, [%[a], #48]\n\t" + "ldr r3, [%[a], #52]\n\t" + "lsr r10, r10, #9\n\t" + "orr r10, r10, r3, lsl #23\n\t" + "str r10, [%[a], #52]\n\t" + "ldr r10, [%[a], #56]\n\t" + "lsr r3, r3, #9\n\t" + "orr r3, r3, r10, lsl #23\n\t" + "str r3, [%[a], #56]\n\t" + "ldr r3, [%[a], #60]\n\t" + "lsr r10, r10, #9\n\t" + "orr r10, r10, r3, lsl #23\n\t" + "str r10, [%[a], #60]\n\t" + "ldr r10, [%[a], #64]\n\t" + "lsr r3, r3, #9\n\t" + "orr r3, r3, r10, lsl #23\n\t" + "str r3, [%[a], #64]\n\t" + "lsr r10, r10, #9\n\t" + "str r10, [%[a], #68]\n\t" + "lsr lr, r10, #9\n\t" + "add %[a], %[a], #4\n\t" + "mov %[mp], lr\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_521_cond_sub_17(a - 17, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). 
* (r = a * b mod m) * @@ -114329,8 +115985,8 @@ static void sp_521_mont_inv_17(sp_digit* r, const sp_digit* a, sp_digit* td) */ static sp_int32 sp_521_cmp_17(const sp_digit* a_p, const sp_digit* b_p) { - register const sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r2, #-1\n\t" @@ -114608,9 +116264,9 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p, */ static void sp_521_mont_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -114645,7 +116301,7 @@ static void sp_521_mont_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi "ldm %[a]!, {r8}\n\t" "ldm %[b]!, {r4}\n\t" "adcs r8, r8, r4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r12, #0x1\n\t" "lsl r12, r12, #8\n\t" "add r12, r12, #0xff\n\t" @@ -114694,8 +116350,8 @@ static void sp_521_mont_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static void sp_521_mont_dbl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "mov r2, #0\n\t" @@ -114721,7 +116377,7 @@ static void sp_521_mont_dbl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi "stm %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" "ldm %[a]!, {r4}\n\t" "adcs r4, r4, r4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x1\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0xff\n\t" @@ -114770,8 +116426,8 @@ static void sp_521_mont_dbl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static void sp_521_mont_tpl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "mov r2, #0\n\t" @@ -114831,7 +116487,7 @@ static void sp_521_mont_tpl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi "ldm %[r], {r4}\n\t" "ldm %[a]!, {r8}\n\t" "adcs r4, r4, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r3, #0x1\n\t" "lsl r3, r3, #8\n\t" "add r3, r3, #0xff\n\t" @@ -114867,9 +116523,9 @@ static void sp_521_mont_tpl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static void sp_521_mont_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; 
__asm__ __volatile__ ( "mov r3, #0\n\t" @@ -114904,7 +116560,7 @@ static void sp_521_mont_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi "ldm %[a]!, {r8}\n\t" "ldm %[b]!, {r4}\n\t" "sbcs r8, r8, r4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r12, #0x1\n\t" "lsl r12, r12, #8\n\t" "add r12, r12, #0xff\n\t" @@ -114946,11 +116602,10 @@ static void sp_521_mont_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi (void)m_p; } -#define sp_521_mont_sub_lower_17 sp_521_mont_sub_17 static void sp_521_rshift1_17(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3}\n\t" @@ -115095,7 +116750,7 @@ static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, /* X = X - Y */ sp_521_mont_sub_17(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_lower_17(y, y, x, p521_mod); + sp_521_mont_sub_17(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -115217,7 +116872,7 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co break; case 16: /* Y = Y - X */ - sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -115283,12 +116938,12 @@ static int sp_521_iszero_17(const sp_digit* a) static void sp_521_proj_point_add_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*17; - sp_digit* t3 = t + 4*17; - sp_digit* t4 = t + 6*17; - sp_digit* t5 = t + 8*17; - sp_digit* t6 = t + 10*17; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*17; + sp_digit* t2 = t + 4*17; + sp_digit* t3 = t + 6*17; + sp_digit* t4 = t + 8*17; + sp_digit* t5 = t + 10*17; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod); @@ -115310,17 +116965,9 @@ static void sp_521_proj_point_add_17(sp_point_521* r, sp_521_proj_point_dbl_17(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_521_mont_sub_17(t2, t2, t1, p521_mod); @@ -115339,20 +116986,31 @@ static void sp_521_proj_point_add_17(sp_point_521* r, sp_521_mont_dbl_17(t3, y, p521_mod); sp_521_mont_sub_17(x, x, t3, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_17(y, y, x, p521_mod); + sp_521_mont_sub_17(y, y, x, p521_mod); sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t5, p521_mod); - for (i = 0; i < 17; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + 
r->infinity = (word32)inf; } - for (i = 0; i < 17; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 17; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -115398,12 +117056,12 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*17; - ctx->t3 = t + 4*17; - ctx->t4 = t + 6*17; - ctx->t5 = t + 8*17; - ctx->t6 = t + 10*17; + ctx->t6 = t; + ctx->t1 = t + 2*17; + ctx->t2 = t + 4*17; + ctx->t3 = t + 6*17; + ctx->t4 = t + 8*17; + ctx->t5 = t + 10*17; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -115510,7 +117168,7 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 22; break; case 22: @@ -115523,22 +117181,28 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 17; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 17; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 17; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -115868,8 +117532,6 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con } #ifdef FP_ECC -#define sp_521_mont_dbl_lower_17 sp_521_mont_dbl_17 -#define sp_521_mont_tpl_lower_17 sp_521_mont_tpl_17 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
@@ -115908,7 +117570,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_17(a, t1, p521_mod); + sp_521_mont_tpl_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -115917,8 +117579,8 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_17(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_17(b, t2, p521_mod); + sp_521_mont_sub_17(t2, b, x, p521_mod); + sp_521_mont_dbl_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -115938,7 +117600,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_17(a, t1, p521_mod); + sp_521_mont_tpl_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -115947,8 +117609,8 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_17(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_17(b, t2, p521_mod); + sp_521_mont_sub_17(t2, b, x, p521_mod); + sp_521_mont_dbl_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -116004,12 +117666,12 @@ typedef struct sp_table_entry_521 { static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*17; - sp_digit* t3 = t + 4*17; - sp_digit* t4 = t + 6*17; - sp_digit* t5 = t + 8*17; - sp_digit* t6 = t + 10*17; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*17; + sp_digit* t6 = t + 4*17; + sp_digit* t1 = t + 6*17; + sp_digit* t4 = t + 8*17; + sp_digit* t5 = t + 10*17; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -116025,13 +117687,9 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, sp_521_proj_point_dbl_17(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_521_mont_sub_17(t2, t2, p->x, p521_mod); @@ -116040,33 +117698,40 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, /* Z3 = H*Z1 */ sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_521_mont_sqr_17(t1, t4, p521_mod, p521_mp_mod); - sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t3, p->x, t5, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_17(x, t1, t5, p521_mod); - sp_521_mont_dbl_17(t1, t3, p521_mod); - sp_521_mont_sub_17(x, x, t1, p521_mod); + sp_521_mont_sqr_17(t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t3, p->x, t1, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t1, t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(t2, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_17(t2, t2, t1, p521_mod); + sp_521_mont_dbl_17(t5, t3, p521_mod); + sp_521_mont_sub_17(x, t2, t5, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_lower_17(t3, t3, x, p521_mod); + sp_521_mont_sub_17(t3, t3, x, p521_mod); sp_521_mont_mul_17(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t5, t5, p->y, p521_mod, p521_mp_mod); - sp_521_mont_sub_17(y, t3, t5, p521_mod); + sp_521_mont_mul_17(t1, t1, p->y, p521_mod, p521_mp_mod); + sp_521_mont_sub_17(y, t3, t1, p521_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 17; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 17; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 17; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -117054,7 +118719,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_521* point = NULL; + sp_point_521* point = NULL; sp_digit* k = NULL; #else sp_point_521 point[2]; @@ -119158,7 +120823,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -119219,7 +120884,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, */ static void sp_521_add_one_17(sp_digit* a_p) { - register sp_digit* a asm ("r0") = a_p; + register sp_digit* a asm ("r0") 
= (sp_digit*)a_p; __asm__ __volatile__ ( "ldm %[a], {r1, r2, r3, r4}\n\t" @@ -119347,7 +121012,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_521* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -119355,7 +121020,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -119625,13 +121290,13 @@ int sp_ecc_secret_gen_521_nb(sp_ecc_ctx_t* sp_ctx, const mp_int* priv, #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) static void sp_521_rshift_17(sp_digit* r_p, const sp_digit* a_p, byte n_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register byte n asm ("r2") = n_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; __asm__ __volatile__ ( "rsb r12, %[n], #32\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[a]]\n\t" "ldr r5, [%[a], #4]\n\t" #else @@ -119716,7 +121381,7 @@ static void sp_521_rshift_17(sp_digit* r_p, const sp_digit* a_p, byte n_p) "lsl r3, r5, r12\n\t" "lsr r5, r5, %[n]\n\t" "orr r4, r4, r3\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "str r4, [%[r], #60]\n\t" "str r5, [%[r], #64]\n\t" #else @@ -119734,9 +121399,9 @@ static void sp_521_rshift_17(sp_digit* r_p, const sp_digit* a_p, byte n_p) #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) static void sp_521_lshift_17(sp_digit* r_p, const sp_digit* a_p, byte n_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register byte n asm ("r2") = n_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; __asm__ __volatile__ ( "rsb r12, %[n], #31\n\t" @@ -119850,9 +121515,9 @@ static void sp_521_lshift_17(sp_digit* r_p, const sp_digit* a_p, byte n_p) static void sp_521_lshift_34(sp_digit* r_p, const sp_digit* a_p, byte n_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register byte n asm ("r2") = n_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; __asm__ __volatile__ ( "rsb r12, %[n], #31\n\t" @@ -120074,16 +121739,15 @@ static void sp_521_lshift_34(sp_digit* r_p, const sp_digit* a_p, byte n_p) */ static sp_digit sp_521_sub_in_place_17(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" "mov r12, #0\n\t" "add lr, %[a], #0x40\n\t" "\n" "L_sp_521_sub_in_pkace_17_word_%=: \n\t" - "subs r12, r10, r12\n\t" + "rsbs r12, r12, #0\n\t" "ldm %[a], {r2, r3, r4, r5}\n\t" "ldm %[b]!, {r6, r7, r8, r9}\n\t" "sbcs r2, r2, r6\n\t" @@ -120091,18 +121755,18 @@ static sp_digit 
sp_521_sub_in_place_17(sp_digit* a_p, const sp_digit* b_p) "sbcs r4, r4, r8\n\t" "sbcs r5, r5, r9\n\t" "stm %[a]!, {r2, r3, r4, r5}\n\t" - "sbc r12, r10, r10\n\t" + "sbc r12, r12, r12\n\t" "cmp %[a], lr\n\t" "bne L_sp_521_sub_in_pkace_17_word_%=\n\t" - "subs r12, r10, r12\n\t" + "rsbs r12, r12, #0\n\t" "ldm %[a], {r2}\n\t" "ldm %[b]!, {r6}\n\t" "sbcs r2, r2, r6\n\t" "stm %[a]!, {r2}\n\t" - "sbc %[a], r10, r10\n\t" + "sbc %[a], %[a], %[a]\n\t" : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -120115,8 +121779,8 @@ static sp_digit sp_521_sub_in_place_17(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_521_sub_in_place_17(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -120169,15 +121833,14 @@ static sp_digit sp_521_sub_in_place_17(sp_digit* a_p, const sp_digit* b_p) */ static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ "ldr r8, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r5, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -120210,7 +121873,7 @@ static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "L_sp_521_mul_d_17_word_%=: \n\t" /* A[i] * B */ "ldr r8, [%[a], r9]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -120255,7 +121918,7 @@ static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "str r3, [%[r], #68]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -120268,15 +121931,14 @@ static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r3, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -120301,611 +121963,11 @@ static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) #else "umull r3, r4, %[b], r8\n\t" #endif + "stm %[r]!, {r3}\n\t" "mov r5, #0\n\t" - "str r3, [%[r]], #4\n\t" /* A[1] * B */ - "ldr r8, [%[a]], 
#4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[2] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[3] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[4] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, 
#16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[5] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[6] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[7] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[8] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - 
"adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[9] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[10] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[11] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[12] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - 
"lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[13] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[14] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[15] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds 
r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[16] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -120931,15 +121993,493 @@ static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[2] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[3] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[4] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" #endif - "str r4, [%[r]], #4\n\t" + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[5] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, 
r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[6] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[7] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[8] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[9] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[10] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds 
r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[11] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[12] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[13] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[14] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[15] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" 
+ "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[16] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" "str r5, [%[r]]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" ); } @@ -120956,9 +122496,9 @@ static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr r6, %[div], #16\n\t" @@ -121015,9 +122555,9 @@ static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr lr, %[div], #1\n\t" @@ -121047,7 +122587,7 @@ static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "bpl L_div_521_word_17_bit_%=\n\t" "add r3, r3, r3\n\t" "add r3, r3, #1\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -121075,7 +122615,7 @@ static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -121103,7 +122643,7 @@ static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -121796,12 +123336,12 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W */ static void sp_521_div2_mod_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* m asm ("r2") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; __asm__ __volatile__ ( - "ldr r4, [%[a]], #4\n\t" + "ldm %[a]!, {r4}\n\t" "ands r3, r4, #1\n\t" "beq L_sp_521_div2_mod_17_even_%=\n\t" "mov r12, #0\n\t" @@ -121853,8 +123393,8 @@ static void sp_521_div2_mod_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi "stm %[r]!, {r4}\n\t" "\n" "L_sp_521_div2_mod_17_div2_%=: \n\t" - "sub %[r], #0x44\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) + "sub %[r], %[r], #0x44\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r8, [%[r]]\n\t" "ldr r9, [%[r], #4]\n\t" #else @@ -121932,7 +123472,7 @@ static void sp_521_div2_mod_17(sp_digit* r_p, const sp_digit* a_p, const sp_digi ); } -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) static const unsigned char L_sp_521_num_bits_17_table[] = { 0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, @@ -121970,7 +123510,8 @@ static const unsigned char L_sp_521_num_bits_17_table[] = { static int sp_521_num_bits_17(const sp_digit* a_p) { - register const sp_digit* a asm ("r0") = a_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register unsigned char* L_sp_521_num_bits_17_table_c asm ("r1") = (unsigned char*)&L_sp_521_num_bits_17_table; __asm__ __volatile__ ( "mov lr, %[L_sp_521_num_bits_17_table]\n\t" @@ -121980,7 +123521,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_16_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x2\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x18\n\t" @@ -121996,7 +123537,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_16_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x2\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x10\n\t" @@ -122012,7 +123553,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_16_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x2\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x8\n\t" @@ -122025,7 +123566,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "\n" "L_sp_521_num_bits_17_16_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x2\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x0\n\t" @@ -122043,7 +123584,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_15_3_%=\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xf8\n\t" @@ -122059,7 +123600,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_15_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xf0\n\t" @@ -122075,7 +123616,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_15_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xe8\n\t" @@ -122088,7 +123629,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "\n" "L_sp_521_num_bits_17_15_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xe0\n\t" @@ -122106,7 +123647,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_14_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xd8\n\t" @@ -122122,7 +123663,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_14_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xd0\n\t" @@ -122138,7 +123679,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_14_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xc8\n\t" @@ -122151,7 +123692,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "\n" "L_sp_521_num_bits_17_14_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xc0\n\t" @@ -122169,7 +123710,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_13_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xb8\n\t" @@ -122185,7 +123726,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_13_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xb0\n\t" @@ -122201,7 +123742,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_13_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, 
#0xa8\n\t" @@ -122214,7 +123755,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "\n" "L_sp_521_num_bits_17_13_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xa0\n\t" @@ -122232,7 +123773,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_12_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x98\n\t" @@ -122248,7 +123789,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_12_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x90\n\t" @@ -122264,7 +123805,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_12_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x88\n\t" @@ -122277,7 +123818,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "\n" "L_sp_521_num_bits_17_12_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x80\n\t" @@ -122295,7 +123836,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_11_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x78\n\t" @@ -122311,7 +123852,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_11_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x70\n\t" @@ -122327,7 +123868,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_11_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x68\n\t" @@ -122340,7 +123881,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "\n" "L_sp_521_num_bits_17_11_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x60\n\t" @@ -122358,7 +123899,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_10_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x58\n\t" @@ -122374,7 +123915,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_10_2_%=\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x50\n\t" @@ -122390,7 +123931,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_10_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x48\n\t" @@ -122403,7 +123944,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "\n" "L_sp_521_num_bits_17_10_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x40\n\t" @@ -122421,7 +123962,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_9_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x38\n\t" @@ -122437,7 +123978,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_9_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x30\n\t" @@ -122453,7 +123994,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_9_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x28\n\t" @@ -122466,7 +124007,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "\n" "L_sp_521_num_bits_17_9_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x20\n\t" @@ -122484,7 +124025,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "lsr r3, r1, #24\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_8_3_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x18\n\t" @@ -122500,7 +124041,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_8_2_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x10\n\t" @@ -122516,7 +124057,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "and r3, r3, #0xff\n\t" "cmp r3, #0\n\t" "beq L_sp_521_num_bits_17_8_1_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x8\n\t" @@ -122529,7 +124070,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "\n" "L_sp_521_num_bits_17_8_1_%=: \n\t" "and r3, r1, #0xff\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x0\n\t" @@ -122849,9 
+124390,9 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "\n" "L_sp_521_num_bits_17_18_%=: \n\t" "mov %[a], r12\n\t" - : [a] "+r" (a) - : [L_sp_521_num_bits_17_table] "r" (L_sp_521_num_bits_17_table) - : "memory", "r1", "r2", "r3", "r12", "lr" + : [a] "+r" (a), [L_sp_521_num_bits_17_table] "+r" (L_sp_521_num_bits_17_table_c) + : + : "memory", "r2", "r3", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -122859,13 +124400,13 @@ static int sp_521_num_bits_17(const sp_digit* a_p) #else static int sp_521_num_bits_17(const sp_digit* a_p) { - register const sp_digit* a asm ("r0") = a_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; __asm__ __volatile__ ( "ldr r1, [%[a], #64]\n\t" "cmp r1, #0\n\t" "beq L_sp_521_num_bits_17_16_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x2\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x20\n\t" @@ -122880,7 +124421,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "ldr r1, [%[a], #60]\n\t" "cmp r1, #0\n\t" "beq L_sp_521_num_bits_17_15_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x2\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x0\n\t" @@ -122895,7 +124436,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "ldr r1, [%[a], #56]\n\t" "cmp r1, #0\n\t" "beq L_sp_521_num_bits_17_14_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xe0\n\t" @@ -122910,7 +124451,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "ldr r1, [%[a], #52]\n\t" "cmp r1, #0\n\t" "beq L_sp_521_num_bits_17_13_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xc0\n\t" @@ -122925,7 +124466,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "ldr r1, [%[a], #48]\n\t" "cmp r1, #0\n\t" "beq L_sp_521_num_bits_17_12_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0xa0\n\t" @@ -122940,7 +124481,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "ldr r1, [%[a], #44]\n\t" "cmp r1, #0\n\t" "beq L_sp_521_num_bits_17_11_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x80\n\t" @@ -122955,7 +124496,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "ldr r1, [%[a], #40]\n\t" "cmp r1, #0\n\t" "beq L_sp_521_num_bits_17_10_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x60\n\t" @@ -122970,7 +124511,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "ldr r1, [%[a], #36]\n\t" "cmp r1, #0\n\t" "beq L_sp_521_num_bits_17_9_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x40\n\t" @@ -122985,7 +124526,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "ldr r1, [%[a], #32]\n\t" "cmp r1, #0\n\t" "beq L_sp_521_num_bits_17_8_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x20\n\t" @@ -123000,7 +124541,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) "ldr r1, [%[a], #28]\n\t" "cmp r1, #0\n\t" "beq L_sp_521_num_bits_17_7_%=\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "mov r2, #0x1\n\t" "lsl r2, r2, #8\n\t" "add r2, r2, #0x0\n\t" @@ -123080,7 +124621,7 @@ static int sp_521_num_bits_17(const sp_digit* a_p) return (uint32_t)(size_t)a; } -#endif /* WOLFSSL_SP_ARM_ARCH && (WOLFSSL_SP_ARM_ARCH < 7) */ +#endif /* WOLFSSL_ARM_ARCH && (WOLFSSL_ARM_ARCH < 7) */ /* Non-constant time modular inversion. * * @param [out] r Resulting number. @@ -124102,17 +125643,16 @@ typedef struct sp_point_1024 { */ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" - "mov r10, #0\n\t" /* A[0] * B[0] */ "ldr r11, [%[a]]\n\t" "ldr r12, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r3, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124142,7 +125682,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [sp]\n\t" /* A[0] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124181,7 +125721,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[1] * B[0] */ "ldr r8, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124219,7 +125759,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [sp, #4]\n\t" /* A[2] * B[0] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124259,7 +125799,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[1] */ "ldr r11, [%[a], #4]\n\t" "ldr r12, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124297,7 +125837,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[2] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124335,7 +125875,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [sp, #8]\n\t" /* A[0] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 
4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124374,7 +125914,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[1] * B[2] */ "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124411,7 +125951,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[2] * B[1] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124449,7 +125989,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[0] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124487,7 +126027,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [sp, #12]\n\t" /* A[4] * B[0] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124526,7 +126066,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[3] * B[1] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124564,7 +126104,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[2] */ "ldr r11, [%[a], #8]\n\t" "ldr r12, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124602,7 +126142,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[3] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124640,7 +126180,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[4] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124678,7 +126218,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [sp, #16]\n\t" /* A[0] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124718,7 +126258,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[4] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124755,7 +126295,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[2] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124792,7 +126332,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[3] * B[2] */ "ldr r8, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124830,7 +126370,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[1] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124868,7 +126408,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[0] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124906,7 +126446,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [sp, #20]\n\t" /* A[6] * B[0] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124946,7 +126486,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[1] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -124983,7 +126523,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[4] * B[2] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125021,7 +126561,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[3] */ "ldr r11, [%[a], #12]\n\t" "ldr r12, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125059,7 +126599,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[4] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125097,7 +126637,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[5] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #20]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125135,7 +126675,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[6] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125173,7 +126713,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [sp, #24]\n\t" /* A[0] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125213,7 +126753,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[6] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125251,7 +126791,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[5] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125288,7 +126828,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[3] * B[4] */ "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125325,7 +126865,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[4] * B[3] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125363,7 +126903,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[2] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125401,7 +126941,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[1] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125439,7 +126979,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[0] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125477,7 +127017,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [sp, #28]\n\t" /* A[8] * B[0] */ "ldr r8, 
[%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125517,7 +127057,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[1] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125555,7 +127095,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[2] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125592,7 +127132,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[5] * B[3] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125630,7 +127170,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[4] */ "ldr r11, [%[a], #16]\n\t" "ldr r12, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125668,7 +127208,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[5] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125706,7 +127246,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[6] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125744,7 +127284,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[7] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125782,7 +127322,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[8] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125820,7 +127360,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [sp, #32]\n\t" /* A[0] * B[9] */ "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125860,7 +127400,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* 
A[1] * B[8] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125898,7 +127438,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[7] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125936,7 +127476,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[6] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -125973,7 +127513,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[4] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126010,7 +127550,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[5] * B[4] */ "ldr r8, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126048,7 +127588,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[3] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126086,7 +127626,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[2] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126124,7 +127664,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[1] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126162,7 +127702,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[0] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126200,7 +127740,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [sp, #36]\n\t" /* A[10] * B[0] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126240,7 +127780,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const 
sp_digit* a_p, const sp_digit* b /* A[9] * B[1] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126278,7 +127818,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[2] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126316,7 +127856,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[3] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126353,7 +127893,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[6] * B[4] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126391,7 +127931,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[5] */ "ldr r11, [%[a], #20]\n\t" "ldr r12, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126429,7 +127969,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[6] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126467,7 +128007,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[7] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126505,7 +128045,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[8] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126543,7 +128083,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[9] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126581,7 +128121,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[10] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126619,7 
+128159,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [sp, #40]\n\t" /* A[0] * B[11] */ "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126659,7 +128199,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[10] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126697,7 +128237,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[9] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126735,7 +128275,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[8] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126773,7 +128313,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[7] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126810,7 +128350,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[5] * B[6] */ "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126847,7 +128387,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[6] * B[5] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126885,7 +128425,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[4] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126923,7 +128463,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[3] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -126961,7 +128501,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[2] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, 
#16\n\t" "lsr r6, r6, #16\n\t" @@ -126999,7 +128539,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[1] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127037,7 +128577,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[0] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127075,7 +128615,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [sp, #44]\n\t" /* A[12] * B[0] */ "ldr r8, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127115,7 +128655,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[1] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127153,7 +128693,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[2] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127191,7 +128731,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[3] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127229,7 +128769,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[4] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127266,7 +128806,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[7] * B[5] */ "ldr r8, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127304,7 +128844,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[6] */ "ldr r11, [%[a], #24]\n\t" "ldr r12, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127342,7 +128882,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[7] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && 
(WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127380,7 +128920,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[8] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127418,7 +128958,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[9] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127456,7 +128996,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[10] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127494,7 +129034,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[11] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127532,7 +129072,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[12] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127570,7 +129110,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [sp, #48]\n\t" /* A[0] * B[13] */ "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127610,7 +129150,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[12] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127648,7 +129188,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[11] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127686,7 +129226,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[10] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127724,7 +129264,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[9] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #36]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127762,7 +129302,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[8] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127799,7 +129339,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[6] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127836,7 +129376,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[7] * B[6] */ "ldr r8, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127874,7 +129414,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[5] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127912,7 +129452,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[4] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127950,7 +129490,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[3] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -127988,7 +129528,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[2] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128026,7 +129566,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[1] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128064,7 +129604,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[0] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128102,7 +129642,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [sp, #52]\n\t" /* A[14] * 
B[0] */ "ldr r8, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128142,7 +129682,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[1] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128180,7 +129720,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[2] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128218,7 +129758,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[3] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128256,7 +129796,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[4] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128294,7 +129834,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[5] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128331,7 +129871,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[8] * B[6] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128369,7 +129909,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[7] */ "ldr r11, [%[a], #28]\n\t" "ldr r12, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128407,7 +129947,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[8] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128445,7 +129985,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[9] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128483,7 +130023,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const 
sp_digit* a_p, const sp_digit* b /* A[4] * B[10] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128521,7 +130061,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[11] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128559,7 +130099,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[12] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128597,7 +130137,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[13] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128635,7 +130175,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[0] * B[14] */ "ldr r8, [%[a]]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128673,7 +130213,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [sp, #56]\n\t" /* A[0] * B[15] */ "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128713,7 +130253,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[14] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128751,7 +130291,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[13] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128789,7 +130329,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[12] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128827,7 +130367,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[11] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" 
@@ -128865,7 +130405,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[10] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128903,7 +130443,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[9] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128940,7 +130480,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[7] * B[8] */ "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -128977,7 +130517,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[8] * B[7] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129015,7 +130555,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[6] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129053,7 +130593,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[5] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129091,7 +130631,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[4] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129129,7 +130669,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[3] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129167,7 +130707,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[2] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129205,7 +130745,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[1] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" 
"lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129243,7 +130783,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[15] * B[0] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129281,7 +130821,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [sp, #60]\n\t" /* A[15] * B[1] */ "ldr r9, [%[b], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129321,7 +130861,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[2] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129359,7 +130899,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[3] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129397,7 +130937,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[4] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129435,7 +130975,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[5] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129473,7 +131013,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[6] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129510,7 +131050,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[9] * B[7] */ "ldr r8, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129548,7 +131088,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[8] */ "ldr r11, [%[a], #32]\n\t" "ldr r12, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129586,7 +131126,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[9] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129624,7 +131164,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[10] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129662,7 +131202,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[11] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129700,7 +131240,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[12] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129738,7 +131278,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[13] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129776,7 +131316,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[2] * B[14] */ "ldr r8, [%[a], #8]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129814,7 +131354,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[1] * B[15] */ "ldr r8, [%[a], #4]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129852,7 +131392,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [%[r], #64]\n\t" /* A[2] * B[15] */ "ldr r8, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129892,7 +131432,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[14] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129930,7 +131470,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[13] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -129968,7 +131508,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[12] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, 
[%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130006,7 +131546,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[11] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130044,7 +131584,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[10] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130081,7 +131621,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[8] * B[9] */ "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130118,7 +131658,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[9] * B[8] */ "ldr r8, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130156,7 +131696,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[7] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130194,7 +131734,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[6] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130232,7 +131772,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[5] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130270,7 +131810,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[4] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130308,7 +131848,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[3] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130346,7 +131886,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* 
A[15] * B[2] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130384,7 +131924,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [%[r], #68]\n\t" /* A[15] * B[3] */ "ldr r9, [%[b], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130424,7 +131964,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[4] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130462,7 +132002,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[5] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130500,7 +132040,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[6] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130538,7 +132078,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[7] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130575,7 +132115,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[10] * B[8] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130613,7 +132153,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[9] */ "ldr r11, [%[a], #36]\n\t" "ldr r12, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130651,7 +132191,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[10] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130689,7 +132229,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[11] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130727,7 +132267,7 @@ static void 
sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[12] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130765,7 +132305,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[13] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130803,7 +132343,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[4] * B[14] */ "ldr r8, [%[a], #16]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130841,7 +132381,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[3] * B[15] */ "ldr r8, [%[a], #12]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130879,7 +132419,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [%[r], #72]\n\t" /* A[4] * B[15] */ "ldr r8, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130919,7 +132459,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[14] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130957,7 +132497,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[13] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -130995,7 +132535,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[12] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131033,7 +132573,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[11] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131070,7 +132610,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[9] * B[10] */ "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, 
#16\n\t" "lsr r6, r6, #16\n\t" @@ -131107,7 +132647,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[10] * B[9] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131145,7 +132685,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[8] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131183,7 +132723,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[7] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131221,7 +132761,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[6] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131259,7 +132799,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[5] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131297,7 +132837,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[15] * B[4] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131335,7 +132875,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [%[r], #76]\n\t" /* A[15] * B[5] */ "ldr r9, [%[b], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131375,7 +132915,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[6] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131413,7 +132953,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[7] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131451,7 +132991,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[8] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131488,7 +133028,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[11] * B[9] */ "ldr r8, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131526,7 +133066,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[10] */ "ldr r11, [%[a], #40]\n\t" "ldr r12, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131564,7 +133104,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[11] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131602,7 +133142,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[12] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131640,7 +133180,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[13] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131678,7 +133218,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[6] * B[14] */ "ldr r8, [%[a], #24]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131716,7 +133256,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[5] * B[15] */ "ldr r8, [%[a], #20]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131754,7 +133294,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [%[r], #80]\n\t" /* A[6] * B[15] */ "ldr r8, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131794,7 +133334,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[14] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131832,7 +133372,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[13] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], 
#52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131870,7 +133410,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[12] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131907,7 +133447,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[10] * B[11] */ "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131944,7 +133484,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[11] * B[10] */ "ldr r8, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -131982,7 +133522,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[9] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132020,7 +133560,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[8] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132058,7 +133598,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[7] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132096,7 +133636,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[15] * B[6] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132134,7 +133674,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [%[r], #84]\n\t" /* A[15] * B[7] */ "ldr r9, [%[b], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132174,7 +133714,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[8] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132212,7 +133752,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] 
* B[9] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132249,7 +133789,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[12] * B[10] */ "ldr r8, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132287,7 +133827,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[11] */ "ldr r11, [%[a], #44]\n\t" "ldr r12, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132325,7 +133865,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[12] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132363,7 +133903,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[13] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132401,7 +133941,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[8] * B[14] */ "ldr r8, [%[a], #32]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132439,7 +133979,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[7] * B[15] */ "ldr r8, [%[a], #28]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132477,7 +134017,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [%[r], #88]\n\t" /* A[8] * B[15] */ "ldr r8, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132517,7 +134057,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[14] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132555,7 +134095,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[13] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132592,7 +134132,7 @@ static void 
sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[11] * B[12] */ "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132629,7 +134169,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[12] * B[11] */ "ldr r8, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132667,7 +134207,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[10] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132705,7 +134245,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[9] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132743,7 +134283,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[15] * B[8] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132781,7 +134321,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [%[r], #92]\n\t" /* A[15] * B[9] */ "ldr r9, [%[b], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132821,7 +134361,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[10] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132858,7 +134398,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[13] * B[11] */ "ldr r8, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132896,7 +134436,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[12] */ "ldr r11, [%[a], #48]\n\t" "ldr r12, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -132934,7 +134474,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[13] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, 
#16\n\t" @@ -132972,7 +134512,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[10] * B[14] */ "ldr r8, [%[a], #40]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133010,7 +134550,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[9] * B[15] */ "ldr r8, [%[a], #36]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133048,7 +134588,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r3, [%[r], #96]\n\t" /* A[10] * B[15] */ "ldr r8, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133088,7 +134628,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[14] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133125,7 +134665,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[12] * B[13] */ "ldr r9, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133162,7 +134702,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[13] * B[12] */ "ldr r8, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133200,7 +134740,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[11] */ "ldr r8, [%[a], #56]\n\t" "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133238,7 +134778,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[15] * B[10] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133276,7 +134816,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r4, [%[r], #100]\n\t" /* A[15] * B[11] */ "ldr r9, [%[b], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133315,7 +134855,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[14] * B[12] */ "ldr r8, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, 
#16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133353,7 +134893,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[13] */ "ldr r11, [%[a], #52]\n\t" "ldr r12, [%[b], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133391,7 +134931,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[12] * B[14] */ "ldr r8, [%[a], #48]\n\t" "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133429,7 +134969,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[11] * B[15] */ "ldr r8, [%[a], #44]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133467,7 +135007,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "str r5, [%[r], #104]\n\t" /* A[12] * B[15] */ "ldr r8, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133506,7 +135046,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[13] * B[14] */ "ldr r9, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133543,7 +135083,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[14] * B[13] */ "ldr r8, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133581,7 +135121,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[15] * B[12] */ "ldr r8, [%[a], #60]\n\t" "ldr r9, [%[b], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133618,7 +135158,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif "str r3, [%[r], #108]\n\t" /* A[15] * B[13] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133658,7 +135198,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[14] * B[14] */ "ldr r11, [%[a], #56]\n\t" "ldr r12, [%[b], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133696,7 +135236,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b /* A[13] * B[15] */ "ldr r8, [%[a], #52]\n\t" "ldr r9, [%[b], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133733,7 +135273,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif "str r4, [%[r], #112]\n\t" /* A[14] * B[15] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r11, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133772,7 +135312,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif /* A[15] * B[14] */ "ldr r8, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r12, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133809,7 +135349,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b #endif "str r5, [%[r], #116]\n\t" /* A[15] * B[15] */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r9, #16\n\t" "lsr r6, r6, #16\n\t" @@ -133835,9 +135375,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r3, r3, r6\n\t" "adc r4, r4, r7\n\t" #else - "umull r6, r7, r8, r9\n\t" - "adds r3, r3, r6\n\t" - "adc r4, r4, r7\n\t" + "umlal r3, r4, r8, r9\n\t" #endif "str r3, [%[r], #120]\n\t" "str r4, [%[r], #124]\n\t" @@ -133851,7 +135389,7 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "stm %[r]!, {r3, r4, r5, r6}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12" ); } @@ -133862,14 +135400,14 @@ static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b */ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" /* A[0] * A[0] */ "ldr r10, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsr r9, r10, #16\n\t" "lsl r2, r10, #16\n\t" "lsr r2, r2, #16\n\t" @@ -133888,7 +135426,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[1] */ "ldr r10, [%[a], #4]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -133944,7 +135482,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -133998,7 +135536,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[1] * A[1] */ "ldr r10, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134028,7 +135566,7 @@ static void sp_1024_sqr_16(sp_digit* 
r_p, const sp_digit* a_p) /* A[0] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134083,7 +135621,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[2] */ "ldr r10, [%[a], #8]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134136,7 +135674,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134191,7 +135729,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134242,7 +135780,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[2] * A[2] */ "ldr r10, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134272,7 +135810,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134302,7 +135840,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134340,7 +135878,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[3] */ "ldr r10, [%[a], #12]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134385,7 +135923,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134415,7 +135953,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134453,7 +135991,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #8]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134490,7 +136028,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[3] * A[3] */ "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134529,7 +136067,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134559,7 +136097,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134597,7 +136135,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134635,7 +136173,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[4] */ "ldr r10, [%[a], #16]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134680,7 +136218,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134710,7 +136248,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134748,7 +136286,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134786,7 +136324,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134823,7 +136361,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[4] * A[4] */ "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) 
"lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134862,7 +136400,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134892,7 +136430,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134930,7 +136468,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -134968,7 +136506,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135006,7 +136544,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[5] */ "ldr r10, [%[a], #20]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135051,7 +136589,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135081,7 +136619,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135119,7 +136657,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135157,7 +136695,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135195,7 +136733,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ 
-135232,7 +136770,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[5] * A[5] */ "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135271,7 +136809,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135301,7 +136839,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135339,7 +136877,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135377,7 +136915,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135415,7 +136953,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135453,7 +136991,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[6] */ "ldr r10, [%[a], #24]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135498,7 +137036,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135528,7 +137066,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135566,7 +137104,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135604,7 +137142,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] 
* A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135642,7 +137180,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135680,7 +137218,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135717,7 +137255,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[6] * A[6] */ "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135756,7 +137294,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135786,7 +137324,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135824,7 +137362,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135862,7 +137400,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135900,7 +137438,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135938,7 +137476,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -135976,7 +137514,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[7] */ "ldr r10, [%[a], #28]\n\t" "ldr r12, [%[a], #24]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136021,7 +137559,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136051,7 +137589,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136089,7 +137627,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136127,7 +137665,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136165,7 +137703,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136203,7 +137741,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136241,7 +137779,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136278,7 +137816,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[7] * A[7] */ "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136317,7 +137855,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[0] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136347,7 +137885,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) 
&& (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136385,7 +137923,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136423,7 +137961,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136461,7 +137999,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136499,7 +138037,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136537,7 +138075,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136575,7 +138113,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[8] */ "ldr r10, [%[a], #32]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136620,7 +138158,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[1] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136650,7 +138188,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136688,7 +138226,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136726,7 +138264,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, 
r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136764,7 +138302,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136802,7 +138340,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136840,7 +138378,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136877,7 +138415,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[8] * A[8] */ "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136916,7 +138454,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[2] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136946,7 +138484,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -136984,7 +138522,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137022,7 +138560,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137060,7 +138598,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137098,7 +138636,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137136,7 +138674,7 @@ static void 
sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[9] */ "ldr r10, [%[a], #36]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137181,7 +138719,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[3] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137211,7 +138749,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137249,7 +138787,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137287,7 +138825,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137325,7 +138863,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137363,7 +138901,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137400,7 +138938,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[9] * A[9] */ "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137439,7 +138977,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[4] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137469,7 +139007,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137507,7 +139045,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[13] */ "ldr 
r10, [%[a], #52]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137545,7 +139083,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137583,7 +139121,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137621,7 +139159,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[10] */ "ldr r10, [%[a], #40]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137666,7 +139204,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[5] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137696,7 +139234,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137734,7 +139272,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137772,7 +139310,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137810,7 +139348,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137847,7 +139385,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[10] * A[10] */ "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137886,7 +139424,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[6] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #24]\n\t" -#if 
defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137916,7 +139454,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137954,7 +139492,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -137992,7 +139530,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138030,7 +139568,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[11] */ "ldr r10, [%[a], #44]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138075,7 +139613,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[7] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138105,7 +139643,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138143,7 +139681,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138181,7 +139719,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138218,7 +139756,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[11] * A[11] */ "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138257,7 +139795,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[8] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138287,7 +139825,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138325,7 +139863,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138363,7 +139901,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[11] * A[12] */ "ldr r10, [%[a], #48]\n\t" "ldr r12, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138408,7 +139946,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[9] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138438,7 +139976,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138476,7 +140014,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[11] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138513,7 +140051,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[12] * A[12] */ "ldr r10, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138552,7 +140090,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[10] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r5, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138582,7 +140120,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[11] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138620,7 +140158,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[12] * A[13] */ "ldr r10, [%[a], #52]\n\t" "ldr r12, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, 
#16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138665,7 +140203,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[11] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138720,7 +140258,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[12] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138771,7 +140309,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[13] * A[13] */ "ldr r10, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138801,7 +140339,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[12] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138856,7 +140394,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[13] * A[14] */ "ldr r10, [%[a], #56]\n\t" "ldr r12, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138909,7 +140447,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[13] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138963,7 +140501,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) #endif /* A[14] * A[14] */ "ldr r10, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -138993,7 +140531,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) /* A[14] * A[15] */ "ldr r10, [%[a], #60]\n\t" "ldr r12, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsl r9, r12, #16\n\t" "lsr r8, r8, #16\n\t" @@ -139048,7 +140586,7 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) "str r4, [%[r], #116]\n\t" /* A[15] * A[15] */ "ldr r10, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r8, r10, #16\n\t" "lsr r9, r10, #16\n\t" "lsr r8, r8, #16\n\t" @@ -139095,12 +140633,11 @@ static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) */ static sp_digit sp_1024_add_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register 
const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -139129,10 +140666,11 @@ static sp_digit sp_1024_add_16(sp_digit* r_p, const sp_digit* a_p, const sp_digi "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -139144,8 +140682,8 @@ static sp_digit sp_1024_add_16(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3, r4, r5}\n\t" @@ -139220,12 +140758,11 @@ static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_1024_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" "ldm %[a]!, {r3, r4, r5, r6}\n\t" "ldm %[b]!, {r7, r8, r9, r10}\n\t" "adds r3, r3, r7\n\t" @@ -139282,10 +140819,11 @@ static sp_digit sp_1024_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digi "adcs r5, r5, r9\n\t" "adcs r6, r6, r10\n\t" "stm %[r]!, {r3, r4, r5, r6}\n\t" - "adc %[r], r12, r12\n\t" + "mov %[r], #0\n\t" + "adc %[r], %[r], #0\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); return (uint32_t)(size_t)r; } @@ -139367,9 +140905,9 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, */ static sp_digit sp_1024_sub_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -139453,9 +140991,9 @@ SP_NOINLINE static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) */ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" @@ -139473,7 +141011,7 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "L_sp_1024_mul_32_inner_%=: \n\t" "ldr lr, 
[%[a], r3]\n\t" "ldr r11, [%[b], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -139543,12 +141081,11 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b */ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r12, #0\n\t" "mov r6, #0\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" @@ -139557,7 +141094,7 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "L_sp_1024_sqr_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" "it cc\n\t" - "movcc r3, r12\n\t" + "movcc r3, #0\n\t" "sub r4, r5, r3\n\t" "\n" "L_sp_1024_sqr_32_inner_%=: \n\t" @@ -139565,7 +141102,7 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "beq L_sp_1024_sqr_32_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsl r10, r11, #16\n\t" "lsr r9, r9, #16\n\t" @@ -139618,7 +141155,7 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "\n" "L_sp_1024_sqr_32_op_sqr_%=: \n\t" "ldr lr, [%[a], r3]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" "lsr r10, lr, #16\n\t" "lsr r9, r9, #16\n\t" @@ -139672,7 +141209,7 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "bgt L_sp_1024_sqr_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -139770,16 +141307,15 @@ static const sp_point_1024 p1024_base = { */ static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" "mov r12, #0\n\t" "add lr, %[a], #0x80\n\t" "\n" "L_sp_1024_sub_in_pkace_32_word_%=: \n\t" - "subs r12, r10, r12\n\t" + "rsbs r12, r12, #0\n\t" "ldm %[a], {r2, r3, r4, r5}\n\t" "ldm %[b]!, {r6, r7, r8, r9}\n\t" "sbcs r2, r2, r6\n\t" @@ -139787,13 +141323,13 @@ static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) "sbcs r4, r4, r8\n\t" "sbcs r5, r5, r9\n\t" "stm %[a]!, {r2, r3, r4, r5}\n\t" - "sbc r12, r10, r10\n\t" + "sbc r12, r12, r12\n\t" "cmp %[a], lr\n\t" "bne L_sp_1024_sub_in_pkace_32_word_%=\n\t" "mov %[a], r12\n\t" : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr" ); return (uint32_t)(size_t)a; } @@ -139810,10 +141346,10 @@ static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) */ static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit 
m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r6, #0\n\t" @@ -139850,10 +141386,10 @@ static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp */ static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -139987,9 +141523,9 @@ static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp */ static sp_digit sp_1024_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r3, #0\n\t" @@ -140026,15 +141562,14 @@ static sp_digit sp_1024_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digi */ static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ "ldr r8, [%[a]]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r5, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -140067,7 +141602,7 @@ static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "L_sp_1024_mul_d_32_word_%=: \n\t" /* A[i] * B */ "ldr r8, [%[a], r9]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -140112,7 +141647,7 @@ static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "str r3, [%[r], #128]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } @@ -140125,15 +141660,14 @@ static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register sp_digit b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; __asm__ __volatile__ ( - "mov r10, #0\n\t" /* A[0] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && 
(WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r3, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -140158,1211 +141692,11 @@ static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) #else "umull r3, r4, %[b], r8\n\t" #endif + "stm %[r]!, {r3}\n\t" "mov r5, #0\n\t" - "str r3, [%[r]], #4\n\t" /* A[1] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[2] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[3] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[4] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, 
#0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[5] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[6] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[7] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[8] * B 
*/ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[9] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[10] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[11] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, 
#16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[12] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[13] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[14] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[15] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" 
- "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[16] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[17] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[18] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[19] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - 
"lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[20] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[21] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[22] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, 
r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[23] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[24] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[25] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[26] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - 
"mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[27] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[28] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adcs r5, r5, #0\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "adc r3, r3, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r7\n\t" - "mov r3, #0\n\t" - "adc r3, r3, #0\n\t" -#endif - "str r4, [%[r]], #4\n\t" - /* A[29] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r5, r5, r7\n\t" - "adcs r3, r3, #0\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "adc r4, r4, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r7\n\t" - "mov r4, #0\n\t" - "adc r4, r4, #0\n\t" -#endif - "str r5, [%[r]], #4\n\t" - /* A[30] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) - "lsl r6, %[b], #16\n\t" - "lsl r7, r8, #16\n\t" - "lsr r6, r6, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r3, r3, r7\n\t" - 
"adcs r4, r4, #0\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" - "lsr r7, r8, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsr r6, %[b], #16\n\t" - "lsr r7, r8, #16\n\t" - "mul r7, r6, r7\n\t" - "adds r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" - "lsl r7, r8, #16\n\t" - "lsr r7, r7, #16\n\t" - "mul r6, r7, r6\n\t" - "lsr r7, r6, #16\n\t" - "lsl r6, r6, #16\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "adc r5, r5, #0\n\t" -#else - "umull r6, r7, %[b], r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r7\n\t" - "mov r5, #0\n\t" - "adc r5, r5, #0\n\t" -#endif - "str r3, [%[r]], #4\n\t" - /* A[31] * B */ - "ldr r8, [%[a]], #4\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r6, %[b], #16\n\t" "lsl r7, r8, #16\n\t" "lsr r6, r6, #16\n\t" @@ -141388,15 +141722,973 @@ static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" #else - "umull r6, r7, %[b], r8\n\t" + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[2] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[3] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[4] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" "adds r4, r4, r6\n\t" "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" #endif - "str r4, [%[r]], #4\n\t" + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[5] * B */ + "ldm 
%[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[6] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[7] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[8] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[9] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, 
#16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[10] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[11] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[12] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[13] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[14] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul 
r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[15] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[16] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[17] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[18] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[19] * B */ + "ldm %[a]!, {r8}\n\t" +#if 
defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[20] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[21] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[22] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[23] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul 
r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[24] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[25] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[26] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[27] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[28] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" 
+ "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" + "mov r3, #0\n\t" + /* A[29] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r5, r5, r7\n\t" + "adc r3, r3, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r3, r3, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r5, r5, r6\n\t" + "adc r3, r3, r7\n\t" +#else + "umlal r5, r3, %[b], r8\n\t" +#endif + "stm %[r]!, {r5}\n\t" + "mov r4, #0\n\t" + /* A[30] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r3, r3, r7\n\t" + "adc r4, r4, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r4, r4, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r3, r3, r6\n\t" + "adc r4, r4, r7\n\t" +#else + "umlal r3, r4, %[b], r8\n\t" +#endif + "stm %[r]!, {r3}\n\t" + "mov r5, #0\n\t" + /* A[31] * B */ + "ldm %[a]!, {r8}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r6, %[b], #16\n\t" + "lsl r7, r8, #16\n\t" + "lsr r6, r6, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r7, r6, r7\n\t" + "adds r4, r4, r7\n\t" + "adc r5, r5, #0\n\t" + "lsr r7, r8, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" + "lsr r6, %[b], #16\n\t" + "lsr r7, r8, #16\n\t" + "mul r7, r6, r7\n\t" + "add r5, r5, r7\n\t" + "lsl r7, r8, #16\n\t" + "lsr r7, r7, #16\n\t" + "mul r6, r7, r6\n\t" + "lsr r7, r6, #16\n\t" + "lsl r6, r6, #16\n\t" + "adds r4, r4, r6\n\t" + "adc r5, r5, r7\n\t" +#else + "umlal r4, r5, %[b], r8\n\t" +#endif + "stm %[r]!, {r4}\n\t" "str r5, [%[r]]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" ); } @@ -141413,9 +142705,9 @@ static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) */ static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr r6, %[div], #16\n\t" @@ -141472,9 +142764,9 @@ static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) */ static sp_digit 
div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - register sp_digit d1 asm ("r0") = d1_p; - register sp_digit d0 asm ("r1") = d0_p; - register sp_digit div asm ("r2") = div_p; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( "lsr lr, %[div], #1\n\t" @@ -141504,7 +142796,7 @@ static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "bpl L_div_1024_word_32_bit_%=\n\t" "add r3, r3, r3\n\t" "add r3, r3, #1\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -141532,7 +142824,7 @@ static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -141560,7 +142852,7 @@ static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) "subs r7, %[d0], r4\n\t" "sbc r8, %[d1], r5\n\t" "add r3, r3, r8\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r7, r3, #16\n\t" "lsl r4, %[div], #16\n\t" "lsr r7, r7, #16\n\t" @@ -141638,8 +142930,8 @@ static void sp_1024_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) */ static sp_int32 sp_1024_cmp_32(const sp_digit* a_p, const sp_digit* b_p) { - register const sp_digit* a asm ("r0") = a_p; - register const sp_digit* b asm ("r1") = b_p; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r2, #-1\n\t" @@ -142170,14 +143462,14 @@ static void sp_1024_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -142355,6 +143647,7 @@ static int sp_1024_point_to_ecc_point_32(const sp_point_1024* p, ecc_point* pm) return err; } +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) /* Reduce the number back to 1024 bits using Montgomery reduction. * * a A single precision number to reduce in place. 
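The three sp_1024_mont_reduce_32 variants that follow (UMULL-free shifts/MULs for WOLFSSL_ARM_ARCH < 4, UMLAL for < 6, UMAAL otherwise) all implement the same word-wise Montgomery reduction; only the multiply-accumulate primitive differs. A minimal C sketch of that loop, assuming a[] has room for the top carry (the real code instead keeps it in a register for the final conditional subtract); names are illustrative, not the library API:

    #include <stdint.h>

    /* Word-wise Montgomery reduction of a 2n-word value a[] modulo the
     * n-word modulus m[], where mp = -m^-1 mod 2^32 and n is 32 for
     * 1024 bits.  After the loop the reduced value sits in a[n..2n-1]
     * and may still need one conditional subtraction of m. */
    static void mont_reduce_sketch(uint32_t* a, const uint32_t* m, uint32_t mp, int n)
    {
        int i, j;

        for (i = 0; i < n; i++) {
            uint32_t mu = a[i] * mp;   /* chosen so a[i] becomes 0 mod 2^32 */
            uint64_t t = 0;

            /* a[i..i+n-1] += mu * m, carry rippling upward */
            for (j = 0; j < n; j++) {
                t += (uint64_t)a[i + j] + (uint64_t)mu * m[j];
                a[i + j] = (uint32_t)t;
                t >>= 32;
            }
            /* fold the remaining carry into the words above */
            for (j = i + n; t != 0; j++) {
                t += a[j];
                a[j] = (uint32_t)t;
                t >>= 32;
            }
        }
        /* the generated code then calls sp_1024_cond_sub_32() so the
         * result fits back into n words */
    }
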
@@ -142363,12 +143656,12 @@ static int sp_1024_point_to_ecc_point_32(const sp_point_1024* p, ecc_point* pm) */ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - register sp_digit* a asm ("r0") = a_p; - register const sp_digit* m asm ("r1") = m_p; - register sp_digit mp asm ("r2") = mp_p; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) +#if !(defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4)) "ldr r11, [%[m]]\n\t" #endif /* i = 0 */ @@ -142381,10 +143674,9 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ /* mu = a[i] * mp */ "mul r8, %[mp], r12\n\t" /* a[i+0] += m[0] * mu */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m]]\n\t" #endif -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r7, r11, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r7\n\t" @@ -142408,14 +143700,8 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r12, r12, r6\n\t" "adc r5, r5, r7\n\t" -#else - "umull r6, r7, r8, r11\n\t" - "adds r12, r12, r6\n\t" - "adc r5, r7, #0\n\t" -#endif /* a[i+1] += m[1] * mu */ "ldr r7, [%[m], #4]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r10\n\t" @@ -142439,18 +143725,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r4, r4, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r4, r10, #0\n\t" -#endif "mov r12, lr\n\t" "adds r12, r12, r5\n\t" "adc r4, r4, #0\n\t" /* a[i+2] += m[2] * mu */ "ldr r7, [%[m], #8]\n\t" "ldr lr, [%[a], #8]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r10, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r10\n\t" @@ -142474,17 +143754,11 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds lr, lr, r6\n\t" "adc r5, r5, r10\n\t" -#else - "umull r6, r10, r8, r7\n\t" - "adds lr, lr, r6\n\t" - "adc r5, r10, #0\n\t" -#endif "adds lr, lr, r4\n\t" "adc r5, r5, #0\n\t" /* a[i+3] += m[3] * mu */ "ldr r7, [%[m], #12]\n\t" "ldr r10, [%[a], #12]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -142508,18 +143782,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #12]\n\t" "adc r4, r4, #0\n\t" /* a[i+4] += m[4] * mu */ "ldr r7, [%[m], #16]\n\t" "ldr r10, [%[a], #16]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -142543,18 +143811,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #16]\n\t" "adc r5, r5, 
#0\n\t" /* a[i+5] += m[5] * mu */ "ldr r7, [%[m], #20]\n\t" "ldr r10, [%[a], #20]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -142578,18 +143840,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #20]\n\t" "adc r4, r4, #0\n\t" /* a[i+6] += m[6] * mu */ "ldr r7, [%[m], #24]\n\t" "ldr r10, [%[a], #24]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -142613,18 +143869,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #24]\n\t" "adc r5, r5, #0\n\t" /* a[i+7] += m[7] * mu */ "ldr r7, [%[m], #28]\n\t" "ldr r10, [%[a], #28]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -142648,18 +143898,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #28]\n\t" "adc r4, r4, #0\n\t" /* a[i+8] += m[8] * mu */ "ldr r7, [%[m], #32]\n\t" "ldr r10, [%[a], #32]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -142683,18 +143927,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #32]\n\t" "adc r5, r5, #0\n\t" /* a[i+9] += m[9] * mu */ "ldr r7, [%[m], #36]\n\t" "ldr r10, [%[a], #36]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -142718,18 +143956,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #36]\n\t" "adc r4, r4, #0\n\t" /* a[i+10] += m[10] * mu */ "ldr r7, [%[m], #40]\n\t" "ldr r10, [%[a], #40]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -142753,18 +143985,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #40]\n\t" "adc r5, r5, #0\n\t" /* a[i+11] += m[11] * mu */ "ldr r7, [%[m], #44]\n\t" "ldr r10, [%[a], #44]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -142788,18 +144014,12 @@ 
static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #44]\n\t" "adc r4, r4, #0\n\t" /* a[i+12] += m[12] * mu */ "ldr r7, [%[m], #48]\n\t" "ldr r10, [%[a], #48]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -142823,18 +144043,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #48]\n\t" "adc r5, r5, #0\n\t" /* a[i+13] += m[13] * mu */ "ldr r7, [%[m], #52]\n\t" "ldr r10, [%[a], #52]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -142858,18 +144072,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #52]\n\t" "adc r4, r4, #0\n\t" /* a[i+14] += m[14] * mu */ "ldr r7, [%[m], #56]\n\t" "ldr r10, [%[a], #56]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -142893,18 +144101,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #56]\n\t" "adc r5, r5, #0\n\t" /* a[i+15] += m[15] * mu */ "ldr r7, [%[m], #60]\n\t" "ldr r10, [%[a], #60]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -142928,18 +144130,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #60]\n\t" "adc r4, r4, #0\n\t" /* a[i+16] += m[16] * mu */ "ldr r7, [%[m], #64]\n\t" "ldr r10, [%[a], #64]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -142963,18 +144159,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #64]\n\t" "adc r5, r5, #0\n\t" /* a[i+17] += m[17] * mu */ "ldr r7, [%[m], #68]\n\t" "ldr r10, [%[a], #68]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -142998,18 +144188,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" 
-#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #68]\n\t" "adc r4, r4, #0\n\t" /* a[i+18] += m[18] * mu */ "ldr r7, [%[m], #72]\n\t" "ldr r10, [%[a], #72]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -143033,18 +144217,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #72]\n\t" "adc r5, r5, #0\n\t" /* a[i+19] += m[19] * mu */ "ldr r7, [%[m], #76]\n\t" "ldr r10, [%[a], #76]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -143068,18 +144246,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #76]\n\t" "adc r4, r4, #0\n\t" /* a[i+20] += m[20] * mu */ "ldr r7, [%[m], #80]\n\t" "ldr r10, [%[a], #80]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -143103,18 +144275,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #80]\n\t" "adc r5, r5, #0\n\t" /* a[i+21] += m[21] * mu */ "ldr r7, [%[m], #84]\n\t" "ldr r10, [%[a], #84]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -143138,18 +144304,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #84]\n\t" "adc r4, r4, #0\n\t" /* a[i+22] += m[22] * mu */ "ldr r7, [%[m], #88]\n\t" "ldr r10, [%[a], #88]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -143173,18 +144333,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #88]\n\t" "adc r5, r5, #0\n\t" /* a[i+23] += m[23] * mu */ "ldr r7, [%[m], #92]\n\t" "ldr r10, [%[a], #92]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -143208,18 +144362,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #92]\n\t" "adc r4, r4, #0\n\t" /* a[i+24] += m[24] * mu */ "ldr r7, [%[m], #96]\n\t" "ldr r10, [%[a], #96]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, 
r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -143243,18 +144391,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #96]\n\t" "adc r5, r5, #0\n\t" /* a[i+25] += m[25] * mu */ "ldr r7, [%[m], #100]\n\t" "ldr r10, [%[a], #100]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -143278,18 +144420,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #100]\n\t" "adc r4, r4, #0\n\t" /* a[i+26] += m[26] * mu */ "ldr r7, [%[m], #104]\n\t" "ldr r10, [%[a], #104]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -143313,18 +144449,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #104]\n\t" "adc r5, r5, #0\n\t" /* a[i+27] += m[27] * mu */ "ldr r7, [%[m], #108]\n\t" "ldr r10, [%[a], #108]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -143348,18 +144478,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #108]\n\t" "adc r4, r4, #0\n\t" /* a[i+28] += m[28] * mu */ "ldr r7, [%[m], #112]\n\t" "ldr r10, [%[a], #112]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -143383,18 +144507,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #112]\n\t" "adc r5, r5, #0\n\t" /* a[i+29] += m[29] * mu */ "ldr r7, [%[m], #116]\n\t" "ldr r10, [%[a], #116]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r4, r6, r11\n\t" @@ -143418,18 +144536,12 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc r4, r4, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r4, r7, #0\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #116]\n\t" "adc r4, r4, #0\n\t" /* a[i+30] += m[30] * mu */ "ldr r7, [%[m], #120]\n\t" "ldr r10, [%[a], #120]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsr r11, r7, #16\n\t" "lsr r6, r8, #16\n\t" "mul r5, r6, r11\n\t" @@ -143453,22 +144565,16 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "lsl r6, r6, #16\n\t" "adds r10, r10, r6\n\t" "adc 
r5, r5, r11\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r10, r10, r6\n\t" - "adc r5, r7, #0\n\t" -#endif "adds r10, r10, r4\n\t" "str r10, [%[a], #120]\n\t" "adc r5, r5, #0\n\t" /* a[i+31] += m[31] * mu */ -#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)) - "ldr r7, [%[m], #124]\n\t" -#else +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "ldr r11, [%[m], #124]\n\t" +#else + "ldr r7, [%[m], #124]\n\t" #endif "ldr r10, [%[a], #124]\n\t" -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4) "lsl r6, r8, #16\n\t" "lsl r7, r11, #16\n\t" "lsr r6, r6, #16\n\t" @@ -143499,13 +144605,6 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "adds r5, r5, r6\n\t" "adcs r4, r4, r7\n\t" "adc r3, r3, #0\n\t" -#else - "umull r6, r7, r8, r7\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r7, r3\n\t" - "mov r3, #0\n\t" - "adc r3, r3, r3\n\t" -#endif "adds r10, r10, r5\n\t" "str r10, [%[a], #124]\n\t" "ldr r10, [%[a], #128]\n\t" @@ -143517,6 +144616,7 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ "add %[a], %[a], #4\n\t" "cmp r9, #0x80\n\t" "blt L_sp_1024_mont_reduce_32_word_%=\n\t" + /* Loop Done */ "str r12, [%[a]]\n\t" "str lr, [%[a], #4]\n\t" "ldr r6, [%[m], #124]\n\t" @@ -143532,6 +144632,517 @@ static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_ sp_1024_cond_sub_32(a - 32, a, m, mp); } +#elif defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) +/* Reduce the number back to 1024 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + "ldr r11, [%[m]]\n\t" + /* i = 0 */ + "mov r9, #0\n\t" + "mov r3, #0\n\t" + "ldr r12, [%[a]]\n\t" + "ldr lr, [%[a], #4]\n\t" + "\n" + "L_sp_1024_mont_reduce_32_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r8, %[mp], r12\n\t" + /* a[i+0] += m[0] * mu */ + "mov r5, #0\n\t" + "umlal r12, r5, r8, r11\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r7, [%[m], #4]\n\t" + "mov r4, #0\n\t" + "umlal lr, r4, r8, r7\n\t" + "mov r12, lr\n\t" + "adds r12, r12, r5\n\t" + "adc r4, r4, #0\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r7, [%[m], #8]\n\t" + "ldr lr, [%[a], #8]\n\t" + "mov r5, #0\n\t" + "umlal lr, r5, r8, r7\n\t" + "adds lr, lr, r4\n\t" + "adc r5, r5, #0\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r7, [%[m], #12]\n\t" + "ldr r10, [%[a], #12]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #12]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r7, [%[m], #16]\n\t" + "ldr r10, [%[a], #16]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #16]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r7, [%[m], #20]\n\t" + "ldr r10, [%[a], #20]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #20]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r7, [%[m], #24]\n\t" + "ldr r10, [%[a], #24]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #24]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+7] += m[7] 
* mu */ + "ldr r7, [%[m], #28]\n\t" + "ldr r10, [%[a], #28]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #28]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r7, [%[m], #32]\n\t" + "ldr r10, [%[a], #32]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #32]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r7, [%[m], #36]\n\t" + "ldr r10, [%[a], #36]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #36]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r7, [%[m], #40]\n\t" + "ldr r10, [%[a], #40]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #40]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r7, [%[m], #44]\n\t" + "ldr r10, [%[a], #44]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #44]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r7, [%[m], #48]\n\t" + "ldr r10, [%[a], #48]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #48]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r7, [%[m], #52]\n\t" + "ldr r10, [%[a], #52]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #52]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r7, [%[m], #56]\n\t" + "ldr r10, [%[a], #56]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #56]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r7, [%[m], #60]\n\t" + "ldr r10, [%[a], #60]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #60]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r7, [%[m], #64]\n\t" + "ldr r10, [%[a], #64]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #64]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r7, [%[m], #68]\n\t" + "ldr r10, [%[a], #68]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #68]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r7, [%[m], #72]\n\t" + "ldr r10, [%[a], #72]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #72]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r7, [%[m], #76]\n\t" + "ldr r10, [%[a], #76]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #76]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r7, [%[m], #80]\n\t" + "ldr r10, [%[a], #80]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #80]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r7, [%[m], #84]\n\t" + "ldr r10, [%[a], #84]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #84]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r7, [%[m], #88]\n\t" + "ldr r10, [%[a], #88]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #88]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r7, [%[m], #92]\n\t" + "ldr r10, [%[a], #92]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, 
r5\n\t" + "str r10, [%[a], #92]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r7, [%[m], #96]\n\t" + "ldr r10, [%[a], #96]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #96]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r7, [%[m], #100]\n\t" + "ldr r10, [%[a], #100]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #100]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r7, [%[m], #104]\n\t" + "ldr r10, [%[a], #104]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #104]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r7, [%[m], #108]\n\t" + "ldr r10, [%[a], #108]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #108]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r7, [%[m], #112]\n\t" + "ldr r10, [%[a], #112]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #112]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r7, [%[m], #116]\n\t" + "ldr r10, [%[a], #116]\n\t" + "mov r4, #0\n\t" + "umlal r10, r4, r8, r7\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #116]\n\t" + "adc r4, r4, #0\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r7, [%[m], #120]\n\t" + "ldr r10, [%[a], #120]\n\t" + "mov r5, #0\n\t" + "umlal r10, r5, r8, r7\n\t" + "adds r10, r10, r4\n\t" + "str r10, [%[a], #120]\n\t" + "adc r5, r5, #0\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r7, [%[m], #124]\n\t" + "ldr r10, [%[a], #124]\n\t" + "umull r6, r7, r8, r7\n\t" + "adds r5, r5, r6\n\t" + "adcs r4, r7, r3\n\t" + "mov r3, #0\n\t" + "adc r3, r3, r3\n\t" + "adds r10, r10, r5\n\t" + "str r10, [%[a], #124]\n\t" + "ldr r10, [%[a], #128]\n\t" + "adcs r10, r10, r4\n\t" + "str r10, [%[a], #128]\n\t" + "adc r3, r3, #0\n\t" + /* i += 1 */ + "add r9, r9, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r9, #0x80\n\t" + "blt L_sp_1024_mont_reduce_32_word_%=\n\t" + /* Loop Done */ + "str r12, [%[a]]\n\t" + "str lr, [%[a], #4]\n\t" + "ldr r6, [%[m], #124]\n\t" + "subs r10, r6, r10\n\t" + "neg r3, r3\n\t" + "sbc r10, r10, r10\n\t" + "orr r3, r3, r10\n\t" + "mov %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_1024_cond_sub_32(a - 32, a, m, mp); +} + +#else +/* Reduce the number back to 1024 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static SP_NOINLINE void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "mov r12, #0\n\t" + "mov lr, #0\n\t" + "ldr r4, [%[a]]\n\t" + "ldr r5, [%[a], #4]\n\t" + "ldr r6, [%[a], #8]\n\t" + "ldr r7, [%[a], #12]\n\t" + "ldr r8, [%[a], #16]\n\t" + "\n" + "L_sp_1024_mont_reduce_32_word_%=: \n\t" + /* mu = a[i] * mp */ + "mul r11, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "ldr r10, [%[m]]\n\t" + "mov r3, #0\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+1] += m[1] * mu */ + "ldr r10, [%[m], #4]\n\t" + "mov r4, r5\n\t" + "umaal r4, r3, r11, r10\n\t" + /* a[i+2] += m[2] * mu */ + "ldr r10, [%[m], #8]\n\t" + "mov r5, r6\n\t" + "umaal r5, r3, r11, r10\n\t" + /* a[i+3] += m[3] * mu */ + "ldr r10, [%[m], #12]\n\t" + "mov r6, r7\n\t" + "umaal r6, r3, r11, r10\n\t" + /* a[i+4] += m[4] * mu */ + "ldr r10, [%[m], #16]\n\t" + "mov r7, r8\n\t" + "umaal r7, r3, r11, r10\n\t" + /* a[i+5] += m[5] * mu */ + "ldr r10, [%[m], #20]\n\t" + "ldr r8, [%[a], #20]\n\t" + "umaal r8, r3, r11, r10\n\t" + /* a[i+6] += m[6] * mu */ + "ldr r10, [%[m], #24]\n\t" + "ldr r9, [%[a], #24]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "ldr r10, [%[m], #28]\n\t" + "ldr r9, [%[a], #28]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "ldr r10, [%[m], #32]\n\t" + "ldr r9, [%[a], #32]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "ldr r10, [%[m], #36]\n\t" + "ldr r9, [%[a], #36]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "ldr r10, [%[m], #40]\n\t" + "ldr r9, [%[a], #40]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "ldr r10, [%[m], #44]\n\t" + "ldr r9, [%[a], #44]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "ldr r10, [%[m], #48]\n\t" + "ldr r9, [%[a], #48]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "ldr r10, [%[m], #52]\n\t" + "ldr r9, [%[a], #52]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "ldr r10, [%[m], #56]\n\t" + "ldr r9, [%[a], #56]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "ldr r10, [%[m], #60]\n\t" + "ldr r9, [%[a], #60]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "ldr r10, [%[m], #64]\n\t" + "ldr r9, [%[a], #64]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "ldr r10, [%[m], #68]\n\t" + "ldr r9, [%[a], #68]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "ldr r10, [%[m], #72]\n\t" + "ldr r9, [%[a], #72]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "ldr r10, [%[m], #76]\n\t" + "ldr r9, [%[a], #76]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "ldr r10, [%[m], #80]\n\t" + "ldr r9, [%[a], #80]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "ldr r10, [%[m], #84]\n\t" + "ldr r9, [%[a], #84]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], 
#84]\n\t" + /* a[i+22] += m[22] * mu */ + "ldr r10, [%[m], #88]\n\t" + "ldr r9, [%[a], #88]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "ldr r10, [%[m], #92]\n\t" + "ldr r9, [%[a], #92]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "ldr r10, [%[m], #96]\n\t" + "ldr r9, [%[a], #96]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "ldr r10, [%[m], #100]\n\t" + "ldr r9, [%[a], #100]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "ldr r10, [%[m], #104]\n\t" + "ldr r9, [%[a], #104]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "ldr r10, [%[m], #108]\n\t" + "ldr r9, [%[a], #108]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "ldr r10, [%[m], #112]\n\t" + "ldr r9, [%[a], #112]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "ldr r10, [%[m], #116]\n\t" + "ldr r9, [%[a], #116]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "ldr r10, [%[m], #120]\n\t" + "ldr r9, [%[a], #120]\n\t" + "umaal r9, r3, r11, r10\n\t" + "str r9, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "ldr r10, [%[m], #124]\n\t" + "ldr r9, [%[a], #124]\n\t" + "umaal r9, r3, r11, r10\n\t" + "ldr r11, [%[a], #128]\n\t" + "mov r10, #0\n\t" + "umaal r3, r11, r10, r10\n\t" + "str r9, [%[a], #124]\n\t" + "adds r3, r3, lr\n\t" + "adc lr, r11, #0\n\t" + "str r3, [%[a], #128]\n\t" + /* i += 1 */ + "add r12, r12, #4\n\t" + "add %[a], %[a], #4\n\t" + "cmp r12, #0x80\n\t" + "blt L_sp_1024_mont_reduce_32_word_%=\n\t" + /* Loop Done */ + "str r4, [%[a]]\n\t" + "str r5, [%[a], #4]\n\t" + "str r6, [%[a], #8]\n\t" + "str r7, [%[a], #12]\n\t" + "str r8, [%[a], #16]\n\t" + "ldr r10, [%[m], #124]\n\t" + "subs r9, r10, r9\n\t" + "neg lr, lr\n\t" + "sbc r9, r9, r9\n\t" + "orr lr, lr, r9\n\t" + "mov %[mp], lr\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + sp_1024_cond_sub_32(a - 32, a, m, mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). 
* (r = a * b mod m) * @@ -143678,10 +145289,10 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p, */ static void sp_1024_mont_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register const sp_digit* m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register const sp_digit* m asm ("r3") = (const sp_digit*)m_p; __asm__ __volatile__ ( "mov r12, #0\n\t" @@ -143747,7 +145358,7 @@ static void sp_1024_mont_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig "neg r12, r12\n\t" "sbc r11, r11, r11\n\t" "sub %[r], %[r], #0x80\n\t" - "orr r12, r11\n\t" + "orr r12, r12, r11\n\t" "ldm %[r], {r4, r5, r6, r7}\n\t" "ldm %[m]!, {r8, r9, r10, r11}\n\t" "and r8, r8, r12\n\t" @@ -143850,9 +145461,9 @@ static void sp_1024_mont_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig */ static void sp_1024_mont_dbl_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* m asm ("r2") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; __asm__ __volatile__ ( "mov r12, #0\n\t" @@ -143902,7 +145513,7 @@ static void sp_1024_mont_dbl_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig "neg r12, r12\n\t" "sbc r4, r4, r4\n\t" "sub %[r], %[r], #0x80\n\t" - "orr r12, r4\n\t" + "orr r12, r12, r4\n\t" "ldm %[r], {r4, r5, r6, r7}\n\t" "ldm %[m]!, {r8, r9, r10, r11}\n\t" "and r8, r8, r12\n\t" @@ -144005,9 +145616,9 @@ static void sp_1024_mont_dbl_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig */ static void sp_1024_mont_tpl_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* m asm ("r2") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; __asm__ __volatile__ ( "mov r12, #0\n\t" @@ -144057,7 +145668,7 @@ static void sp_1024_mont_tpl_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig "neg r12, r12\n\t" "sbc r4, r4, r4\n\t" "sub %[r], %[r], #0x80\n\t" - "orr r12, r4\n\t" + "orr r12, r12, r4\n\t" "ldm %[r], {r4, r5, r6, r7}\n\t" "ldm %[m]!, {r8, r9, r10, r11}\n\t" "and r8, r8, r12\n\t" @@ -144212,7 +145823,7 @@ static void sp_1024_mont_tpl_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig "neg r12, r12\n\t" "sbc r7, r7, r7\n\t" "sub %[r], %[r], #0x80\n\t" - "orr r12, r7\n\t" + "orr r12, r12, r7\n\t" "ldm %[r], {r4, r5, r6, r7}\n\t" "ldm %[m]!, {r8, r9, r10, r11}\n\t" "and r8, r8, r12\n\t" @@ -144316,10 +145927,10 @@ static void sp_1024_mont_tpl_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig */ static void sp_1024_mont_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register const sp_digit* m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm 
("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register const sp_digit* m asm ("r3") = (const sp_digit*)m_p; __asm__ __volatile__ ( "ldm %[a]!, {r4, r5, r6, r7}\n\t" @@ -144474,7 +146085,6 @@ static void sp_1024_mont_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig ); } -#define sp_1024_mont_sub_lower_32 sp_1024_mont_sub_32 #ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. @@ -144486,10 +146096,10 @@ static void sp_1024_mont_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_dig */ static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov lr, #0\n\t" @@ -144526,10 +146136,10 @@ static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp */ static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; - register sp_digit m asm ("r3") = m_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( "mov r8, #0\n\t" @@ -144656,8 +146266,8 @@ static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp #endif /* WOLFSSL_SP_SMALL */ static void sp_1024_rshift1_32(sp_digit* r_p, const sp_digit* a_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( "ldm %[a], {r2, r3}\n\t" @@ -144861,7 +146471,7 @@ static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_32(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); + sp_1024_mont_sub_32(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -144983,7 +146593,7 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -145017,9 +146627,9 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, */ static sp_digit sp_1024_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "mov r12, #0\n\t" @@ -145054,9 +146664,9 @@ static sp_digit sp_1024_sub_32(sp_digit* 
r_p, const sp_digit* a_p, const sp_digi */ static sp_digit sp_1024_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - register sp_digit* r asm ("r0") = r_p; - register const sp_digit* a asm ("r1") = a_p; - register const sp_digit* b asm ("r2") = b_p; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( "ldm %[a]!, {r3, r4, r5, r6}\n\t" @@ -145171,12 +146781,12 @@ static int sp_1024_iszero_32(const sp_digit* a) static void sp_1024_proj_point_add_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*32; - sp_digit* t3 = t + 4*32; - sp_digit* t4 = t + 6*32; - sp_digit* t5 = t + 8*32; - sp_digit* t6 = t + 10*32; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*32; + sp_digit* t2 = t + 4*32; + sp_digit* t3 = t + 6*32; + sp_digit* t4 = t + 8*32; + sp_digit* t5 = t + 10*32; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod); @@ -145198,17 +146808,9 @@ static void sp_1024_proj_point_add_32(sp_point_1024* r, sp_1024_proj_point_dbl_32(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_1024_mont_sub_32(t2, t2, t1, p1024_mod); @@ -145227,20 +146829,31 @@ static void sp_1024_proj_point_add_32(sp_point_1024* r, sp_1024_mont_dbl_32(t3, y, p1024_mod); sp_1024_mont_sub_32(x, x, t3, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); + sp_1024_mont_sub_32(y, y, x, p1024_mod); sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t5, p1024_mod); - for (i = 0; i < 32; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 32; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 32; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -145286,12 +146899,12 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*32; - ctx->t3 = t + 4*32; - ctx->t4 = t + 6*32; - ctx->t5 = t + 8*32; - ctx->t6 = t + 10*32; + ctx->t6 = t; + ctx->t1 = t + 2*32; + ctx->t2 = t + 4*32; + ctx->t3 = t + 6*32; + ctx->t4 = t + 8*32; + ctx->t5 = t + 10*32; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -145398,7 +147011,7 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); + 
sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 22; break; case 22: @@ -145411,22 +147024,28 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 32; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 32; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 32; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -145586,8 +147205,6 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, } #if defined(FP_ECC) || !defined(WOLFSSL_SP_SMALL) -#define sp_1024_mont_dbl_lower_32 sp_1024_mont_dbl_32 -#define sp_1024_mont_tpl_lower_32 sp_1024_mont_tpl_32 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -145626,7 +147243,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); + sp_1024_mont_tpl_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -145635,8 +147252,8 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); + sp_1024_mont_sub_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -145656,7 +147273,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); + sp_1024_mont_tpl_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -145665,8 +147282,8 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); + sp_1024_mont_sub_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -145722,12 +147339,12 @@ 
typedef struct sp_table_entry_1024 { static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*32; - sp_digit* t3 = t + 4*32; - sp_digit* t4 = t + 6*32; - sp_digit* t5 = t + 8*32; - sp_digit* t6 = t + 10*32; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*32; + sp_digit* t6 = t + 4*32; + sp_digit* t1 = t + 6*32; + sp_digit* t4 = t + 8*32; + sp_digit* t5 = t + 10*32; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -145743,13 +147360,9 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, sp_1024_proj_point_dbl_32(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_1024_mont_sub_32(t2, t2, p->x, p1024_mod); @@ -145758,33 +147371,40 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, /* Z3 = H*Z1 */ sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_1024_mont_sqr_32(t1, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t3, p->x, t5, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_32(x, t1, t5, p1024_mod); - sp_1024_mont_dbl_32(t1, t3, p1024_mod); - sp_1024_mont_sub_32(x, x, t1, p1024_mod); + sp_1024_mont_sqr_32(t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t3, p->x, t1, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t1, t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(t2, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_32(t2, t2, t1, p1024_mod); + sp_1024_mont_dbl_32(t5, t3, p1024_mod); + sp_1024_mont_sub_32(x, t2, t5, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_lower_32(t3, t3, x, p1024_mod); + sp_1024_mont_sub_32(t3, t3, x, p1024_mod); sp_1024_mont_mul_32(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t5, t5, p->y, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_32(y, t3, t5, p1024_mod); + sp_1024_mont_mul_32(t1, t1, p->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_32(y, t3, t1, p1024_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 32; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 32; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 32; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -150218,7 +151838,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, + point = 
(sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 9db1ffca4..0233d835b 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -52,6 +52,15 @@ #include +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif + #ifdef WOLFSSL_SP_ARM64_ASM #define SP_PRINT_NUM(var, name, total, words, bits) \ do { \ @@ -193,14 +202,14 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -758,7 +767,7 @@ static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 32]\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 48]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -906,7 +915,7 @@ static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 96]\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 112]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -1237,7 +1246,7 @@ static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 224]\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 240]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -2489,7 +2498,7 @@ static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r]], #16\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" + "adc %[c], xzr, xzr\n\t" "cmp %[a], x11\n\t" "b.ne 1b\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -2681,7 +2690,7 @@ static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r]], #16\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" + "adc %[c], xzr, xzr\n\t" "cmp %[a], x11\n\t" "b.ne 1b\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -3339,7 +3348,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "umulh x8, x10, x9\n\t" "adds x6, x6, x7\n\t" "adcs x8, x8, x3\n\t" - "cset x3, cs\n\t" + "adc x3, xzr, xzr\n\t" "adds x27, x28, x6\n\t" "ldr x28, [%[a], 128]\n\t" "adcs x28, x28, x8\n\t" @@ -3719,7 +3728,7 @@ static void sp_2048_mul_d_16(sp_digit* r, const sp_digit* a, /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. 
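For readers of the sp_*_from_mp() hunks in this part of the patch: the rewritten loop replaces the per-iteration index arithmetic with a single running counter j that starts at -(a->used), so both the copy mask and the source-index advance come from its sign bit. Below is a minimal standalone C sketch of that pattern; the names copy_words_ct, digit and NUM_WORDS are illustrative only and are not wolfSSL identifiers.

/* Constant-time copy of `used` words from dp into r, zero-padding the
 * rest, without branching on the index (mirrors the updated
 * sp_*_from_mp() loops for DIGIT_BIT == 64). */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t digit;
#define NUM_WORDS 8

static void copy_words_ct(digit* r, const digit* dp, int used)
{
    digit j = (digit)0 - (digit)used;      /* "negative" while i < used */
    int o = 0;                             /* source index, tops out at used-1 */
    int i;

    for (i = 0; i < NUM_WORDS; i++) {
        digit mask = (digit)0 - (j >> 63); /* all ones while j is negative */
        r[i] = dp[o] & mask;
        j++;
        o += (int)(j >> 63);               /* advance only while mask is set */
    }
}

int main(void)
{
    digit src[NUM_WORDS] = { 1, 2, 3 };
    digit dst[NUM_WORDS];
    int i;

    copy_words_ct(dst, src, 3);
    for (i = 0; i < NUM_WORDS; i++)
        printf("%llu ", (unsigned long long)dst[i]);
    printf("\n");                          /* prints: 1 2 3 0 0 0 0 0 */
    return 0;
}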
@@ -4642,7 +4651,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, "ldp x8, x9, [%[a], 248]\n\t" "adds x5, x5, x6\n\t" "adcs x7, x7, x3\n\t" - "cset x3, cs\n\t" + "adc x3, xzr, xzr\n\t" "adds x8, x8, x5\n\t" "str x8, [%[a], 248]\n\t" "adcs x9, x9, x7\n\t" @@ -4939,7 +4948,7 @@ static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. @@ -5230,7 +5239,7 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_dig /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. @@ -7086,14 +7095,14 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -7475,7 +7484,7 @@ static sp_digit sp_3072_add_6(sp_digit* r, const sp_digit* a, "adcs x4, x4, x8\n\t" "str x3, [%[r], 32]\n\t" "str x4, [%[r], 40]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -7601,7 +7610,7 @@ static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 64]\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 80]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -7877,7 +7886,7 @@ static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 160]\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 176]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -8318,7 +8327,7 @@ static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 352]\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 368]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -11292,7 +11301,7 @@ static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r]], #16\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" + "adc %[c], xzr, xzr\n\t" "cmp %[a], x11\n\t" "b.ne 1b\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -11484,7 +11493,7 @@ static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r]], #16\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" + "adc %[c], xzr, xzr\n\t" "cmp %[a], x11\n\t" "b.ne 1b\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -12354,7 +12363,7 @@ SP_NOINLINE static void 
sp_3072_mont_reduce_24(sp_digit* a, const sp_digit* m, "ldp x8, x9, [%[a], 184]\n\t" "adds x5, x5, x6\n\t" "adcs x7, x7, x3\n\t" - "cset x3, cs\n\t" + "adc x3, xzr, xzr\n\t" "adds x8, x8, x5\n\t" "str x8, [%[a], 184]\n\t" "adcs x9, x9, x7\n\t" @@ -12862,7 +12871,7 @@ static void sp_3072_mul_d_24(sp_digit* r, const sp_digit* a, /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. @@ -13977,7 +13986,7 @@ SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, "ldp x8, x9, [%[a], 376]\n\t" "adds x5, x5, x6\n\t" "adcs x7, x7, x3\n\t" - "cset x3, cs\n\t" + "adc x3, xzr, xzr\n\t" "adds x8, x8, x5\n\t" "str x8, [%[a], 376]\n\t" "adcs x9, x9, x7\n\t" @@ -14370,7 +14379,7 @@ static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. @@ -14717,7 +14726,7 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_dig /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. @@ -16689,14 +16698,14 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -17234,7 +17243,7 @@ static sp_digit sp_4096_add_64(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 480]\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 496]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -17476,7 +17485,7 @@ static sp_digit sp_4096_add_64(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r]], #16\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" + "adc %[c], xzr, xzr\n\t" "cmp %[a], x11\n\t" "b.ne 1b\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -18840,7 +18849,7 @@ SP_NOINLINE static void sp_4096_mont_reduce_64(sp_digit* a, const sp_digit* m, "ldp x8, x9, [%[a], 504]\n\t" "adds x5, x5, x6\n\t" "adcs x7, x7, x3\n\t" - "cset x3, cs\n\t" + "adc x3, xzr, xzr\n\t" "adds x8, x8, x5\n\t" "str x8, [%[a], 504]\n\t" "adcs x9, x9, x7\n\t" @@ -19329,7 +19338,7 @@ static sp_digit sp_4096_sub_64(sp_digit* r, const sp_digit* a, #endif /* WOLFSSL_SP_SMALL */ /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. 
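The div_word helpers referenced by the surrounding hunks all carry the same documentation: divide the double-width value (d1|d0) by a divisor whose highest bit is set. As a reference point only, assuming a 64-bit host compiler with the GCC/Clang unsigned __int128 extension and d1 < div (so the quotient fits in one word), the intended result can be written directly; the name div_2by1_ref is hypothetical, and the real assembly routines compute this with estimate-and-correct steps rather than a 128-bit division.

#include <stdint.h>
#include <stdio.h>

/* Reference only: quotient of (d1|d0) / div with div normalised
 * (highest bit set) and d1 < div so the result fits in 64 bits. */
static uint64_t div_2by1_ref(uint64_t d1, uint64_t d0, uint64_t div)
{
    unsigned __int128 n = ((unsigned __int128)d1 << 64) | d0;

    return (uint64_t)(n / div);
}

int main(void)
{
    /* (1 << 64) / (1 << 63) == 2 */
    printf("%llu\n",
        (unsigned long long)div_2by1_ref(1, 0, 0x8000000000000000ULL));
    return 0;
}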
@@ -19732,7 +19741,7 @@ static sp_digit sp_4096_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_dig /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. @@ -21864,112 +21873,101 @@ static void sp_256_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b) */ static void sp_256_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b) { - sp_digit tmp[4]; - __asm__ __volatile__ ( - "ldp x16, x17, [%[a], 0]\n\t" - "ldp x21, x22, [%[b], 0]\n\t" - "# A[0] * B[0]\n\t" - "mul x8, x16, x21\n\t" - "ldr x19, [%[a], 16]\n\t" - "umulh x9, x16, x21\n\t" - "ldr x23, [%[b], 16]\n\t" - "# A[0] * B[1]\n\t" - "mul x4, x16, x22\n\t" - "ldr x20, [%[a], 24]\n\t" - "umulh x5, x16, x22\n\t" - "ldr x24, [%[b], 24]\n\t" - "adds x9, x9, x4\n\t" - "# A[1] * B[0]\n\t" - "mul x4, x17, x21\n\t" - "adc x10, xzr, x5\n\t" - "umulh x5, x17, x21\n\t" - "adds x9, x9, x4\n\t" - "# A[0] * B[2]\n\t" - "mul x4, x16, x23\n\t" - "adcs x10, x10, x5\n\t" - "umulh x5, x16, x23\n\t" - "adc x11, xzr, xzr\n\t" - "adds x10, x10, x4\n\t" - "# A[1] * B[1]\n\t" - "mul x4, x17, x22\n\t" - "adc x11, x11, x5\n\t" - "umulh x5, x17, x22\n\t" - "adds x10, x10, x4\n\t" - "# A[2] * B[0]\n\t" - "mul x4, x19, x21\n\t" - "adcs x11, x11, x5\n\t" - "umulh x5, x19, x21\n\t" - "adc x12, xzr, xzr\n\t" - "adds x10, x10, x4\n\t" - "# A[0] * B[3]\n\t" - "mul x4, x16, x24\n\t" - "adcs x11, x11, x5\n\t" - "umulh x5, x16, x24\n\t" - "adc x12, x12, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[1] * B[2]\n\t" - "mul x4, x17, x23\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x17, x23\n\t" - "adc x13, xzr, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[2] * B[1]\n\t" - "mul x4, x19, x22\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x19, x22\n\t" - "adc x13, x13, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[3] * B[0]\n\t" - "mul x4, x20, x21\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x20, x21\n\t" - "adc x13, x13, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[1] * B[3]\n\t" - "mul x4, x17, x24\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x17, x24\n\t" - "adc x13, x13, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[2] * B[2]\n\t" - "mul x4, x19, x23\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x19, x23\n\t" - "adc x14, xzr, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[3] * B[1]\n\t" - "mul x4, x20, x22\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x20, x22\n\t" - "adc x14, x14, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[2] * B[3]\n\t" - "mul x4, x19, x24\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x19, x24\n\t" - "adc x14, x14, xzr\n\t" - "adds x13, x13, x4\n\t" - "# A[3] * B[2]\n\t" - "mul x4, x20, x23\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x20, x23\n\t" - "adc x15, xzr, xzr\n\t" - "adds x13, x13, x4\n\t" - "# A[3] * B[3]\n\t" - "mul x4, x20, x24\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x20, x24\n\t" - "adc x15, x15, xzr\n\t" - "adds x14, x14, x4\n\t" - "adc x15, x15, x5\n\t" - "stp x8, x9, [%[r], 0]\n\t" - "stp x10, x11, [%[r], 16]\n\t" - "stp x12, x13, [%[r], 32]\n\t" - "stp x14, x15, [%[r], 48]\n\t" + "ldp x13, x14, [%[a], 0]\n\t" + "ldp x15, x16, [%[a], 16]\n\t" + "ldp x17, x19, [%[b], 0]\n\t" + "ldp x20, x21, [%[b], 16]\n\t" + "# A[0] * B[0]\n\t" + "umulh x6, x13, x17\n\t" + "mul x5, x13, x17\n\t" + "# A[2] * B[0]\n\t" + "umulh x8, x15, x17\n\t" + "mul x7, x15, x17\n\t" + "# A[1] * B[0]\n\t" + "mul x3, x14, x17\n\t" + "adds x6, x6, x3\n\t" + "umulh x4, x14, x17\n\t" + "adcs x7, 
x7, x4\n\t" + "adc x8, x8, xzr\n\t" + "# A[0] * B[2]\n\t" + "mul x3, x13, x20\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x13, x20\n\t" + "adcs x8, x8, x4\n\t" + "# A[1] * B[3]\n\t" + "mul x9, x14, x21\n\t" + "adcs x9, x9, xzr\n\t" + "umulh x10, x14, x21\n\t" + "adc x10, x10, xzr\n\t" + "# A[0] * B[1]\n\t" + "mul x3, x13, x19\n\t" + "adds x6, x6, x3\n\t" + "umulh x4, x13, x19\n\t" + "adcs x7, x7, x4\n\t" + "# A[2] * B[1]\n\t" + "mul x3, x15, x19\n\t" + "adcs x8, x8, x3\n\t" + "umulh x4, x15, x19\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + "# A[1] * B[2]\n\t" + "mul x3, x14, x20\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x14, x20\n\t" + "adcs x9, x9, x4\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, xzr, xzr\n\t" + "# A[1] * B[1]\n\t" + "mul x3, x14, x19\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x14, x19\n\t" + "adcs x8, x8, x4\n\t" + "# A[3] * B[1]\n\t" + "mul x3, x16, x19\n\t" + "adcs x9, x9, x3\n\t" + "umulh x4, x16, x19\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + "# A[2] * B[2]\n\t" + "mul x3, x15, x20\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x15, x20\n\t" + "adcs x10, x10, x4\n\t" + "# A[3] * B[3]\n\t" + "mul x3, x16, x21\n\t" + "adcs x11, x11, x3\n\t" + "umulh x12, x16, x21\n\t" + "adc x12, x12, xzr\n\t" + "# A[0] * B[3]\n\t" + "mul x3, x13, x21\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x13, x21\n\t" + "adcs x9, x9, x4\n\t" + "# A[2] * B[3]\n\t" + "mul x3, x15, x21\n\t" + "adcs x10, x10, x3\n\t" + "umulh x4, x15, x21\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + "# A[3] * B[0]\n\t" + "mul x3, x16, x17\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x16, x17\n\t" + "adcs x9, x9, x4\n\t" + "# A[3] * B[2]\n\t" + "mul x3, x16, x20\n\t" + "adcs x10, x10, x3\n\t" + "umulh x4, x16, x20\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + "stp x5, x6, [%[r], 0]\n\t" + "stp x7, x8, [%[r], 16]\n\t" + "stp x9, x10, [%[r], 32]\n\t" + "stp x11, x12, [%[r], 48]\n\t" : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp) - : "memory", "x4", "x5", "x6", "x7", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "cc" + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "cc" ); } @@ -21982,72 +21980,68 @@ static void sp_256_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b) static void sp_256_sqr_4(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( - "ldp x16, x17, [%[a], 0]\n\t" - "# A[0] * A[1]\n\t" - "mul x9, x16, x17\n\t" - "ldr x19, [%[a], 16]\n\t" - "umulh x10, x16, x17\n\t" - "ldr x20, [%[a], 24]\n\t" - "# A[0] * A[2]\n\t" - "mul x4, x16, x19\n\t" - "umulh x5, x16, x19\n\t" - "adds x10, x10, x4\n\t" - "# A[0] * A[3]\n\t" - "mul x4, x16, x20\n\t" - "adc x11, xzr, x5\n\t" - "umulh x5, x16, x20\n\t" - "adds x11, x11, x4\n\t" - "# A[1] * A[2]\n\t" - "mul x4, x17, x19\n\t" - "adc x12, xzr, x5\n\t" - "umulh x5, x17, x19\n\t" - "adds x11, x11, x4\n\t" - "# A[1] * A[3]\n\t" - "mul x4, x17, x20\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x17, x20\n\t" - "adc x13, xzr, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[2] * A[3]\n\t" - "mul x4, x19, x20\n\t" - "adc x13, x13, x5\n\t" - "umulh x5, x19, x20\n\t" - "adds x13, x13, x4\n\t" - "adc x14, xzr, x5\n\t" + "ldp x12, x13, [%[a], 0]\n\t" + "ldp x14, x15, [%[a], 16]\n\t" + "# A[0] * A[1]\n\t" + "umulh x6, x12, x13\n\t" + "mul x5, x12, x13\n\t" + "# A[0] * A[3]\n\t" + "umulh x8, x12, x15\n\t" + "mul x7, x12, x15\n\t" + "# A[0] * 
A[2]\n\t" + "mul x2, x12, x14\n\t" + "adds x6, x6, x2\n\t" + "umulh x3, x12, x14\n\t" + "adcs x7, x7, x3\n\t" + "# A[1] * A[3]\n\t" + "mul x2, x13, x15\n\t" + "adcs x8, x8, x2\n\t" + "umulh x9, x13, x15\n\t" + "adc x9, x9, xzr\n\t" + "# A[1] * A[2]\n\t" + "mul x2, x13, x14\n\t" + "adds x7, x7, x2\n\t" + "umulh x3, x13, x14\n\t" + "adcs x8, x8, x3\n\t" + "# A[2] * A[3]\n\t" + "mul x2, x14, x15\n\t" + "adcs x9, x9, x2\n\t" + "umulh x10, x14, x15\n\t" + "adc x10, x10, xzr\n\t" "# Double\n\t" - "adds x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" "adcs x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - "adcs x12, x12, x12\n\t" - "adcs x13, x13, x13\n\t" - "# A[0] * A[0]\n\t" - "mul x8, x16, x16\n\t" - "adcs x14, x14, x14\n\t" - "umulh x3, x16, x16\n\t" - "cset x15, cs\n\t" - "# A[1] * A[1]\n\t" - "mul x4, x17, x17\n\t" - "adds x9, x9, x3\n\t" - "umulh x5, x17, x17\n\t" - "adcs x10, x10, x4\n\t" - "# A[2] * A[2]\n\t" - "mul x6, x19, x19\n\t" - "adcs x11, x11, x5\n\t" - "umulh x7, x19, x19\n\t" - "adcs x12, x12, x6\n\t" - "# A[3] * A[3]\n\t" - "mul x16, x20, x20\n\t" - "adcs x13, x13, x7\n\t" - "umulh x17, x20, x20\n\t" - "adcs x14, x14, x16\n\t" - "adc x15, x15, x17\n\t" - "stp x8, x9, [%[r], 0]\n\t" - "stp x10, x11, [%[r], 16]\n\t" - "stp x12, x13, [%[r], 32]\n\t" - "stp x14, x15, [%[r], 48]\n\t" + "adc x11, xzr, xzr\n\t" + "# A[0] * A[0]\n\t" + "umulh x3, x12, x12\n\t" + "mul x4, x12, x12\n\t" + "# A[1] * A[1]\n\t" + "mul x2, x13, x13\n\t" + "adds x5, x5, x3\n\t" + "umulh x3, x13, x13\n\t" + "adcs x6, x6, x2\n\t" + "# A[2] * A[2]\n\t" + "mul x2, x14, x14\n\t" + "adcs x7, x7, x3\n\t" + "umulh x3, x14, x14\n\t" + "adcs x8, x8, x2\n\t" + "# A[3] * A[3]\n\t" + "mul x2, x15, x15\n\t" + "adcs x9, x9, x3\n\t" + "umulh x3, x15, x15\n\t" + "adcs x10, x10, x2\n\t" + "adc x11, x11, x3\n\t" + "stp x4, x5, [%[r], 0]\n\t" + "stp x6, x7, [%[r], 16]\n\t" + "stp x8, x9, [%[r], 32]\n\t" + "stp x10, x11, [%[r], 48]\n\t" : : [r] "r" (r), [a] "r" (a) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "cc" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "cc" ); } @@ -22071,7 +22065,7 @@ static sp_digit sp_256_add_4(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 0]\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 16]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -22198,14 +22192,14 @@ static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -22426,181 +22420,171 @@ static void sp_256_cond_copy_4(sp_digit* r, const sp_digit* a, sp_digit m) * m Modulus (prime). * mp Montgomery multiplier. 
*/ -SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b, +static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m, sp_digit mp) { (void)m; (void)mp; __asm__ __volatile__ ( - "ldp x16, x17, [%[a], 0]\n\t" - "ldp x21, x22, [%[b], 0]\n\t" - "# A[0] * B[0]\n\t" - "mul x8, x16, x21\n\t" - "ldr x19, [%[a], 16]\n\t" - "umulh x9, x16, x21\n\t" - "ldr x23, [%[b], 16]\n\t" - "# A[0] * B[1]\n\t" - "mul x4, x16, x22\n\t" - "ldr x20, [%[a], 24]\n\t" - "umulh x5, x16, x22\n\t" - "ldr x24, [%[b], 24]\n\t" - "adds x9, x9, x4\n\t" - "# A[1] * B[0]\n\t" - "mul x4, x17, x21\n\t" - "adc x10, xzr, x5\n\t" - "umulh x5, x17, x21\n\t" - "adds x9, x9, x4\n\t" - "# A[0] * B[2]\n\t" - "mul x4, x16, x23\n\t" - "adcs x10, x10, x5\n\t" - "umulh x5, x16, x23\n\t" - "adc x11, xzr, xzr\n\t" - "adds x10, x10, x4\n\t" - "# A[1] * B[1]\n\t" - "mul x4, x17, x22\n\t" - "adc x11, x11, x5\n\t" - "umulh x5, x17, x22\n\t" - "adds x10, x10, x4\n\t" - "# A[2] * B[0]\n\t" - "mul x4, x19, x21\n\t" - "adcs x11, x11, x5\n\t" - "umulh x5, x19, x21\n\t" - "adc x12, xzr, xzr\n\t" - "adds x10, x10, x4\n\t" - "# A[0] * B[3]\n\t" - "mul x4, x16, x24\n\t" - "adcs x11, x11, x5\n\t" - "umulh x5, x16, x24\n\t" - "adc x12, x12, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[1] * B[2]\n\t" - "mul x4, x17, x23\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x17, x23\n\t" - "adc x13, xzr, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[2] * B[1]\n\t" - "mul x4, x19, x22\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x19, x22\n\t" - "adc x13, x13, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[3] * B[0]\n\t" - "mul x4, x20, x21\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x20, x21\n\t" - "adc x13, x13, xzr\n\t" - "adds x11, x11, x4\n\t" - "# A[1] * B[3]\n\t" - "mul x4, x17, x24\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x17, x24\n\t" - "adc x13, x13, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[2] * B[2]\n\t" - "mul x4, x19, x23\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x19, x23\n\t" - "adc x14, xzr, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[3] * B[1]\n\t" - "mul x4, x20, x22\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x20, x22\n\t" - "adc x14, x14, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[2] * B[3]\n\t" - "mul x4, x19, x24\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x19, x24\n\t" - "adc x14, x14, xzr\n\t" - "adds x13, x13, x4\n\t" - "# A[3] * B[2]\n\t" - "mul x4, x20, x23\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x20, x23\n\t" - "adc x15, xzr, xzr\n\t" - "adds x13, x13, x4\n\t" - "# A[3] * B[3]\n\t" - "mul x4, x20, x24\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x20, x24\n\t" - "adc x15, x15, xzr\n\t" - "adds x14, x14, x4\n\t" - "mov x4, x8\n\t" - "adc x15, x15, x5\n\t" + "ldp x13, x14, [%[a], 0]\n\t" + "ldp x15, x16, [%[a], 16]\n\t" + "ldp x17, x19, [%[b], 0]\n\t" + "ldp x20, x21, [%[b], 16]\n\t" + "# A[0] * B[0]\n\t" + "umulh x6, x13, x17\n\t" + "mul x5, x13, x17\n\t" + "# A[2] * B[0]\n\t" + "umulh x8, x15, x17\n\t" + "mul x7, x15, x17\n\t" + "# A[1] * B[0]\n\t" + "mul x3, x14, x17\n\t" + "adds x6, x6, x3\n\t" + "umulh x4, x14, x17\n\t" + "adcs x7, x7, x4\n\t" + "adc x8, x8, xzr\n\t" + "# A[0] * B[2]\n\t" + "mul x3, x13, x20\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x13, x20\n\t" + "adcs x8, x8, x4\n\t" + "# A[1] * B[3]\n\t" + "mul x9, x14, x21\n\t" + "adcs x9, x9, xzr\n\t" + "umulh x10, x14, x21\n\t" + "adc x10, x10, xzr\n\t" + "# A[0] * B[1]\n\t" + "mul x3, x13, x19\n\t" + "adds x6, x6, x3\n\t" + "umulh x4, x13, x19\n\t" + "adcs x7, x7, x4\n\t" + "# A[2] * B[1]\n\t" + "mul x3, x15, 
x19\n\t" + "adcs x8, x8, x3\n\t" + "umulh x4, x15, x19\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + "# A[1] * B[2]\n\t" + "mul x3, x14, x20\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x14, x20\n\t" + "adcs x9, x9, x4\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, xzr, xzr\n\t" + "# A[1] * B[1]\n\t" + "mul x3, x14, x19\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x14, x19\n\t" + "adcs x8, x8, x4\n\t" + "# A[3] * B[1]\n\t" + "mul x3, x16, x19\n\t" + "adcs x9, x9, x3\n\t" + "umulh x4, x16, x19\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + "# A[2] * B[2]\n\t" + "mul x3, x15, x20\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x15, x20\n\t" + "adcs x10, x10, x4\n\t" + "# A[3] * B[3]\n\t" + "mul x3, x16, x21\n\t" + "adcs x11, x11, x3\n\t" + "umulh x12, x16, x21\n\t" + "adc x12, x12, xzr\n\t" + "# A[0] * B[3]\n\t" + "mul x3, x13, x21\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x13, x21\n\t" + "adcs x9, x9, x4\n\t" + "# A[2] * B[3]\n\t" + "mul x3, x15, x21\n\t" + "adcs x10, x10, x3\n\t" + "umulh x4, x15, x21\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + "# A[3] * B[0]\n\t" + "mul x3, x16, x17\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x16, x17\n\t" + "adcs x9, x9, x4\n\t" + "# A[3] * B[2]\n\t" + "mul x3, x16, x20\n\t" + "adcs x10, x10, x3\n\t" + "umulh x4, x16, x20\n\t" + "adcs x11, x11, x4\n\t" + "mov x3, x5\n\t" + "adc x12, x12, xzr\n\t" "# Start Reduction\n\t" - "mov x5, x9\n\t" - "mov x6, x10\n\t" + "mov x4, x6\n\t" + "mov x13, x7\n\t" "# mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192\n\t" "# - a[0] << 32 << 192\n\t" "# + (a[0] * 2) << 192\n\t" "# a[0]-a[2] << 32\n\t" - "extr x22, x10, x9, 32\n\t" - "add x7, x11, x8\n\t" - "extr x21, x9, x8, 32\n\t" - "add x7, x7, x8\n\t" + "lsl x15, x5, #32\n\t" + "extr x17, x7, x6, 32\n\t" + "add x14, x8, x5\n\t" + "extr x16, x6, x5, 32\n\t" + "add x14, x14, x5\n\t" "# + a[0]-a[2] << 32 << 64\n\t" "# - a[0] << 32 << 192\n\t" - "adds x5, x5, x8, lsl #32\n\t" - "sub x7, x7, x8, lsl #32\n\t" - "adcs x6, x6, x21\n\t" - "adc x7, x7, x22\n\t" + "adds x4, x4, x15\n\t" + "sub x14, x14, x15\n\t" + "adcs x13, x13, x16\n\t" + "adc x14, x14, x17\n\t" "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t" "# a += mu << 256\n\t" - "adds x12, x12, x4\n\t" - "adcs x13, x13, x5\n\t" - "adcs x14, x14, x6\n\t" - "adcs x15, x15, x7\n\t" - "cset x8, cs\n\t" + "adds x9, x9, x3\n\t" + "adcs x10, x10, x4\n\t" + "adcs x11, x11, x13\n\t" + "adcs x12, x12, x14\n\t" + "adc x5, xzr, xzr\n\t" "# a += mu << 192\n\t" "# mu <<= 32\n\t" "# a += (mu << 32) << 64\n\t" - "adds x11, x11, x4\n\t" - "adcs x12, x12, x5\n\t" - "adcs x13, x13, x6\n\t" - "lsr x16, x7, 32\n\t" - "adcs x14, x14, x7\n\t" - "extr x7, x7, x6, 32\n\t" - "adcs x15, x15, xzr\n\t" - "extr x6, x6, x5, 32\n\t" - "adc x8, x8, xzr\n\t" - "extr x5, x5, x4, 32\n\t" - "lsl x4, x4, 32\n\t" - "adds x9, x9, x4\n\t" - "adcs x10, x10, x5\n\t" - "adcs x11, x11, x6\n\t" - "adcs x12, x12, x7\n\t" - "adcs x13, x13, x16\n\t" - "adcs x14, x14, xzr\n\t" - "adcs x15, x15, xzr\n\t" - "adc x8, x8, xzr\n\t" + "adds x8, x8, x3\n\t" + "extr x16, x14, x13, 32\n\t" + "adcs x9, x9, x4\n\t" + "extr x15, x13, x4, 32\n\t" + "adcs x10, x10, x13\n\t" + "extr x4, x4, x3, 32\n\t" + "adcs x11, x11, x14\n\t" + "lsl x3, x3, 32\n\t" + "adc x13, xzr, xzr\n\t" + "adds x6, x6, x3\n\t" + "lsr x17, x14, 32\n\t" + "adcs x7, x7, x4\n\t" + "adcs x8, x8, x15\n\t" + "adcs x9, x9, x16\n\t" + "adcs x10, x10, x17\n\t" + "adcs x11, x11, xzr\n\t" + "adcs x12, x12, x13\n\t" + "adc x5, x5, xzr\n\t" "# a -= (mu << 32) << 192\n\t" - 
"subs x11, x11, x4\n\t" - "sbcs x12, x12, x5\n\t" - "sbcs x13, x13, x6\n\t" - "sbcs x14, x14, x7\n\t" - "sbcs x15, x15, x16\n\t" - "mov x19, 0xffffffff00000001\n\t" - "sbc x8, x8, xzr\n\t" - "neg x8, x8\n\t" + "subs x8, x8, x3\n\t" + "sbcs x9, x9, x4\n\t" + "sbcs x10, x10, x15\n\t" + "sbcs x11, x11, x16\n\t" + "sbcs x12, x12, x17\n\t" + "sbc x5, x5, xzr\n\t" + "neg x5, x5\n\t" "# mask m and sub from result if overflow\n\t" "# m[0] = -1 & mask = mask\n\t" - "subs x12, x12, x8\n\t" + "subs x9, x9, x5\n\t" "# m[1] = 0xffffffff & mask = mask >> 32 as mask is all 1s or 0s\n\t" - "lsr x17, x8, 32\n\t" - "sbcs x13, x13, x17\n\t" - "and x19, x19, x8\n\t" + "lsr x16, x5, 32\n\t" + "sbcs x10, x10, x16\n\t" + "sub x17, xzr, x16\n\t" "# m[2] = 0 & mask = 0\n\t" - "sbcs x14, x14, xzr\n\t" - "stp x12, x13, [%[r], 0]\n\t" + "sbcs x11, x11, xzr\n\t" + "stp x9, x10, [%[r], 0]\n\t" "# m[3] = 0xffffffff00000001 & mask\n\t" - "sbc x15, x15, x19\n\t" - "stp x14, x15, [%[r], 16]\n\t" - : [a] "+r" (a), [b] "+r" (b) - : [r] "r" (r) - : "memory", "x4", "x5", "x6", "x7", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "cc" + "sbc x12, x12, x17\n\t" + "stp x11, x12, [%[r], 16]\n\t" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "x3", "x4", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "cc" ); } @@ -22611,144 +22595,139 @@ SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, const sp_digit* a, const * m Modulus (prime). * mp Montgomery multiplier. */ -SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const sp_digit* m, +static void sp_256_mont_sqr_4(sp_digit* r, const sp_digit* a, const sp_digit* m, sp_digit mp) { (void)m; (void)mp; __asm__ __volatile__ ( - "ldp x16, x17, [%[a], 0]\n\t" - "# A[0] * A[1]\n\t" - "mul x9, x16, x17\n\t" - "ldr x19, [%[a], 16]\n\t" - "umulh x10, x16, x17\n\t" - "ldr x20, [%[a], 24]\n\t" - "# A[0] * A[2]\n\t" - "mul x4, x16, x19\n\t" - "umulh x5, x16, x19\n\t" - "adds x10, x10, x4\n\t" - "# A[0] * A[3]\n\t" - "mul x4, x16, x20\n\t" - "adc x11, xzr, x5\n\t" - "umulh x5, x16, x20\n\t" - "adds x11, x11, x4\n\t" - "# A[1] * A[2]\n\t" - "mul x4, x17, x19\n\t" - "adc x12, xzr, x5\n\t" - "umulh x5, x17, x19\n\t" - "adds x11, x11, x4\n\t" - "# A[1] * A[3]\n\t" - "mul x4, x17, x20\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x17, x20\n\t" - "adc x13, xzr, xzr\n\t" - "adds x12, x12, x4\n\t" - "# A[2] * A[3]\n\t" - "mul x4, x19, x20\n\t" - "adc x13, x13, x5\n\t" - "umulh x5, x19, x20\n\t" - "adds x13, x13, x4\n\t" - "adc x14, xzr, x5\n\t" + "ldp x12, x13, [%[a], 0]\n\t" + "ldp x14, x15, [%[a], 16]\n\t" + "# A[0] * A[1]\n\t" + "umulh x6, x12, x13\n\t" + "mul x5, x12, x13\n\t" + "# A[0] * A[3]\n\t" + "umulh x8, x12, x15\n\t" + "mul x7, x12, x15\n\t" + "# A[0] * A[2]\n\t" + "mul x2, x12, x14\n\t" + "adds x6, x6, x2\n\t" + "umulh x3, x12, x14\n\t" + "adcs x7, x7, x3\n\t" + "# A[1] * A[3]\n\t" + "mul x2, x13, x15\n\t" + "adcs x8, x8, x2\n\t" + "umulh x9, x13, x15\n\t" + "adc x9, x9, xzr\n\t" + "# A[1] * A[2]\n\t" + "mul x2, x13, x14\n\t" + "adds x7, x7, x2\n\t" + "umulh x3, x13, x14\n\t" + "adcs x8, x8, x3\n\t" + "# A[2] * A[3]\n\t" + "mul x2, x14, x15\n\t" + "adcs x9, x9, x2\n\t" + "umulh x10, x14, x15\n\t" + "adc x10, x10, xzr\n\t" "# Double\n\t" - "adds x9, x9, x9\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" "adcs x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - 
"adcs x12, x12, x12\n\t" - "adcs x13, x13, x13\n\t" - "# A[0] * A[0]\n\t" - "mul x8, x16, x16\n\t" - "adcs x14, x14, x14\n\t" - "umulh x3, x16, x16\n\t" - "cset x15, cs\n\t" - "# A[1] * A[1]\n\t" - "mul x4, x17, x17\n\t" - "adds x9, x9, x3\n\t" - "umulh x5, x17, x17\n\t" - "adcs x10, x10, x4\n\t" - "# A[2] * A[2]\n\t" - "mul x6, x19, x19\n\t" - "adcs x11, x11, x5\n\t" - "umulh x7, x19, x19\n\t" - "adcs x12, x12, x6\n\t" - "# A[3] * A[3]\n\t" - "mul x16, x20, x20\n\t" - "adcs x13, x13, x7\n\t" - "umulh x17, x20, x20\n\t" - "adcs x14, x14, x16\n\t" - "mov x3, x8\n\t" - "adc x15, x15, x17\n\t" + "adc x11, xzr, xzr\n\t" + "# A[0] * A[0]\n\t" + "umulh x3, x12, x12\n\t" + "mul x4, x12, x12\n\t" + "# A[1] * A[1]\n\t" + "mul x2, x13, x13\n\t" + "adds x5, x5, x3\n\t" + "umulh x3, x13, x13\n\t" + "adcs x6, x6, x2\n\t" + "# A[2] * A[2]\n\t" + "mul x2, x14, x14\n\t" + "adcs x7, x7, x3\n\t" + "umulh x3, x14, x14\n\t" + "adcs x8, x8, x2\n\t" + "# A[3] * A[3]\n\t" + "mul x2, x15, x15\n\t" + "adcs x9, x9, x3\n\t" + "umulh x3, x15, x15\n\t" + "adcs x10, x10, x2\n\t" + "mov x2, x4\n\t" + "adc x11, x11, x3\n\t" "# Start Reduction\n\t" - "mov x4, x9\n\t" - "mov x5, x10\n\t" + "mov x3, x5\n\t" + "mov x12, x6\n\t" "# mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192\n\t" "# - a[0] << 32 << 192\n\t" "# + (a[0] * 2) << 192\n\t" "# a[0]-a[2] << 32\n\t" - "extr x21, x10, x9, 32\n\t" - "add x6, x11, x8\n\t" - "extr x20, x9, x8, 32\n\t" - "add x6, x6, x8\n\t" + "lsl x14, x4, #32\n\t" + "extr x16, x6, x5, 32\n\t" + "add x13, x7, x4\n\t" + "extr x15, x5, x4, 32\n\t" + "add x13, x13, x4\n\t" "# + a[0]-a[2] << 32 << 64\n\t" "# - a[0] << 32 << 192\n\t" - "adds x4, x4, x8, lsl #32\n\t" - "sub x6, x6, x8, lsl #32\n\t" - "adcs x5, x5, x20\n\t" - "adc x6, x6, x21\n\t" + "adds x3, x3, x14\n\t" + "sub x13, x13, x14\n\t" + "adcs x12, x12, x15\n\t" + "adc x13, x13, x16\n\t" "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t" "# a += mu << 256\n\t" - "adds x12, x12, x3\n\t" - "adcs x13, x13, x4\n\t" - "adcs x14, x14, x5\n\t" - "adcs x15, x15, x6\n\t" - "cset x8, cs\n\t" + "adds x8, x8, x2\n\t" + "adcs x9, x9, x3\n\t" + "adcs x10, x10, x12\n\t" + "adcs x11, x11, x13\n\t" + "adc x4, xzr, xzr\n\t" "# a += mu << 192\n\t" "# mu <<= 32\n\t" "# a += (mu << 32) << 64\n\t" - "adds x11, x11, x3\n\t" - "adcs x12, x12, x4\n\t" - "adcs x13, x13, x5\n\t" - "lsr x7, x6, 32\n\t" - "adcs x14, x14, x6\n\t" - "extr x6, x6, x5, 32\n\t" - "adcs x15, x15, xzr\n\t" - "extr x5, x5, x4, 32\n\t" - "adc x8, x8, xzr\n\t" - "extr x4, x4, x3, 32\n\t" - "lsl x3, x3, 32\n\t" - "adds x9, x9, x3\n\t" - "adcs x10, x10, x4\n\t" - "adcs x11, x11, x5\n\t" - "adcs x12, x12, x6\n\t" - "adcs x13, x13, x7\n\t" - "adcs x14, x14, xzr\n\t" - "adcs x15, x15, xzr\n\t" - "adc x8, x8, xzr\n\t" + "adds x7, x7, x2\n\t" + "extr x15, x13, x12, 32\n\t" + "adcs x8, x8, x3\n\t" + "extr x14, x12, x3, 32\n\t" + "adcs x9, x9, x12\n\t" + "extr x3, x3, x2, 32\n\t" + "adcs x10, x10, x13\n\t" + "lsl x2, x2, 32\n\t" + "adc x12, xzr, xzr\n\t" + "adds x5, x5, x2\n\t" + "lsr x16, x13, 32\n\t" + "adcs x6, x6, x3\n\t" + "adcs x7, x7, x14\n\t" + "adcs x8, x8, x15\n\t" + "adcs x9, x9, x16\n\t" + "adcs x10, x10, xzr\n\t" + "adcs x11, x11, x12\n\t" + "adc x4, x4, xzr\n\t" "# a -= (mu << 32) << 192\n\t" - "subs x11, x11, x3\n\t" - "sbcs x12, x12, x4\n\t" - "sbcs x13, x13, x5\n\t" - "sbcs x14, x14, x6\n\t" - "sbcs x15, x15, x7\n\t" - "mov x17, 0xffffffff00000001\n\t" - "sbc x8, x8, xzr\n\t" - "neg x8, x8\n\t" + "subs x7, x7, x2\n\t" + "sbcs x8, x8, x3\n\t" + "sbcs x9, x9, 
x14\n\t" + "sbcs x10, x10, x15\n\t" + "sbcs x11, x11, x16\n\t" + "sbc x4, x4, xzr\n\t" + "neg x4, x4\n\t" "# mask m and sub from result if overflow\n\t" "# m[0] = -1 & mask = mask\n\t" - "subs x12, x12, x8\n\t" + "subs x8, x8, x4\n\t" "# m[1] = 0xffffffff & mask = mask >> 32 as mask is all 1s or 0s\n\t" - "lsr x16, x8, 32\n\t" - "sbcs x13, x13, x16\n\t" - "and x17, x17, x8\n\t" + "lsr x15, x4, 32\n\t" + "sbcs x9, x9, x15\n\t" + "sub x16, xzr, x15\n\t" "# m[2] = 0 & mask = 0\n\t" - "sbcs x14, x14, xzr\n\t" - "stp x12, x13, [%[r], 0]\n\t" + "sbcs x10, x10, xzr\n\t" + "stp x8, x9, [%[r], 0]\n\t" "# m[3] = 0xffffffff00000001 & mask\n\t" - "sbc x15, x15, x17\n\t" - "stp x14, x15, [%[r], 16]\n\t" + "sbc x11, x11, x16\n\t" + "stp x10, x11, [%[r], 16]\n\t" : : [r] "r" (r), [a] "r" (a) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "cc" + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "cc" ); } @@ -22990,52 +22969,51 @@ SP_NOINLINE static void sp_256_mont_reduce_4(sp_digit* a, const sp_digit* m, "# - a[0] << 32 << 192\n\t" "# + (a[0] * 2) << 192\n\t" "# a[0]-a[2] << 32\n\t" - "extr x20, x12, x11, 32\n\t" + "lsl x7, x10, #32\n\t" + "extr x9, x12, x11, 32\n\t" "add x6, x13, x10\n\t" - "extr x19, x11, x10, 32\n\t" + "extr x8, x11, x10, 32\n\t" "add x6, x6, x10\n\t" "# + a[0]-a[2] << 32 << 64\n\t" "# - a[0] << 32 << 192\n\t" - "adds x4, x4, x10, lsl #32\n\t" - "sub x6, x6, x10, lsl #32\n\t" - "adcs x5, x5, x19\n\t" - "adc x6, x6, x20\n\t" + "adds x4, x4, x7\n\t" + "sub x6, x6, x7\n\t" + "adcs x5, x5, x8\n\t" + "adc x6, x6, x9\n\t" "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t" "# a += mu << 256\n\t" "adds x14, x14, x3\n\t" "adcs x15, x15, x4\n\t" "adcs x16, x16, x5\n\t" "adcs x17, x17, x6\n\t" - "cset x10, cs\n\t" + "adc x10, xzr, xzr\n\t" "# a += mu << 192\n\t" "# mu <<= 32\n\t" "# a += (mu << 32) << 64\n\t" "adds x13, x13, x3\n\t" + "extr x8, x6, x5, 32\n\t" "adcs x14, x14, x4\n\t" + "extr x7, x5, x4, 32\n\t" "adcs x15, x15, x5\n\t" - "lsr x7, x6, 32\n\t" - "adcs x16, x16, x6\n\t" - "extr x6, x6, x5, 32\n\t" - "adcs x17, x17, xzr\n\t" - "extr x5, x5, x4, 32\n\t" - "adc x10, x10, xzr\n\t" "extr x4, x4, x3, 32\n\t" - "lsl x3, x3, 32\n\t" + "adcs x16, x16, x6\n\t" + "lsl x3, x3, 32\n\t" + "adc x5, xzr, xzr\n\t" "adds x11, x11, x3\n\t" + "lsr x9, x6, 32\n\t" "adcs x12, x12, x4\n\t" - "adcs x13, x13, x5\n\t" - "adcs x14, x14, x6\n\t" - "adcs x15, x15, x7\n\t" + "adcs x13, x13, x7\n\t" + "adcs x14, x14, x8\n\t" + "adcs x15, x15, x9\n\t" "adcs x16, x16, xzr\n\t" - "adcs x17, x17, xzr\n\t" + "adcs x17, x17, x5\n\t" "adc x10, x10, xzr\n\t" "# a -= (mu << 32) << 192\n\t" "subs x13, x13, x3\n\t" "sbcs x14, x14, x4\n\t" - "sbcs x15, x15, x5\n\t" - "sbcs x16, x16, x6\n\t" - "sbcs x17, x17, x7\n\t" - "mov x9, 0xffffffff00000001\n\t" + "sbcs x15, x15, x7\n\t" + "sbcs x16, x16, x8\n\t" + "sbcs x17, x17, x9\n\t" "sbc x10, x10, xzr\n\t" "neg x10, x10\n\t" "# mask m and sub from result if overflow\n\t" @@ -23044,7 +23022,7 @@ SP_NOINLINE static void sp_256_mont_reduce_4(sp_digit* a, const sp_digit* m, "# m[1] = 0xffffffff & mask = mask >> 32 as mask is all 1s or 0s\n\t" "lsr x8, x10, 32\n\t" "sbcs x15, x15, x8\n\t" - "and x9, x9, x10\n\t" + "sub x9, xzr, x8\n\t" "# m[2] = 0 & mask = 0\n\t" "sbcs x16, x16, xzr\n\t" "stp x14, x15, [%[a], 0]\n\t" @@ -23056,162 +23034,6 @@ SP_NOINLINE static void sp_256_mont_reduce_4(sp_digit* a, const 
sp_digit* m, : "memory", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x19", "x20", "cc" ); } -/* Reduce the number back to 256 bits using Montgomery reduction. - * - * a A single precision number to reduce in place. - * m The single precision number representing the modulus. - * mp The digit representing the negative inverse of m mod 2^n. - */ -SP_NOINLINE static void sp_256_mont_reduce_order_4(sp_digit* a, const sp_digit* m, - sp_digit mp) -{ - __asm__ __volatile__ ( - "ldp x9, x10, [%[a], 0]\n\t" - "ldp x11, x12, [%[a], 16]\n\t" - "ldp x17, x19, [%[m], 0]\n\t" - "ldp x20, x21, [%[m], 16]\n\t" - "mov x8, xzr\n\t" - "# mu = a[0] * mp\n\t" - "mul x5, %[mp], x9\n\t" - "ldr x13, [%[a], 32]\n\t" - "# a[0+0] += m[0] * mu\n\t" - "mul x3, x17, x5\n\t" - "ldr x14, [%[a], 40]\n\t" - "umulh x6, x17, x5\n\t" - "ldr x15, [%[a], 48]\n\t" - "adds x9, x9, x3\n\t" - "ldr x16, [%[a], 56]\n\t" - "adc x6, x6, xzr\n\t" - "# a[0+1] += m[1] * mu\n\t" - "mul x3, x19, x5\n\t" - "umulh x7, x19, x5\n\t" - "adds x3, x3, x6\n\t" - "adc x7, x7, xzr\n\t" - "adds x10, x10, x3\n\t" - "adc x7, x7, xzr\n\t" - "# a[0+2] += m[2] * mu\n\t" - "mul x3, x20, x5\n\t" - "umulh x6, x20, x5\n\t" - "adds x3, x3, x7\n\t" - "adc x6, x6, xzr\n\t" - "adds x11, x11, x3\n\t" - "adc x6, x6, xzr\n\t" - "# a[0+3] += m[3] * mu\n\t" - "mul x3, x21, x5\n\t" - "umulh x4, x21, x5\n\t" - "adds x3, x3, x6\n\t" - "adcs x4, x4, x8\n\t" - "cset x8, cs\n\t" - "adds x12, x12, x3\n\t" - "adcs x13, x13, x4\n\t" - "adc x8, x8, xzr\n\t" - "# mu = a[1] * mp\n\t" - "mul x5, %[mp], x10\n\t" - "# a[1+0] += m[0] * mu\n\t" - "mul x3, x17, x5\n\t" - "umulh x6, x17, x5\n\t" - "adds x10, x10, x3\n\t" - "adc x6, x6, xzr\n\t" - "# a[1+1] += m[1] * mu\n\t" - "mul x3, x19, x5\n\t" - "umulh x7, x19, x5\n\t" - "adds x3, x3, x6\n\t" - "adc x7, x7, xzr\n\t" - "adds x11, x11, x3\n\t" - "adc x7, x7, xzr\n\t" - "# a[1+2] += m[2] * mu\n\t" - "mul x3, x20, x5\n\t" - "umulh x6, x20, x5\n\t" - "adds x3, x3, x7\n\t" - "adc x6, x6, xzr\n\t" - "adds x12, x12, x3\n\t" - "adc x6, x6, xzr\n\t" - "# a[1+3] += m[3] * mu\n\t" - "mul x3, x21, x5\n\t" - "umulh x4, x21, x5\n\t" - "adds x3, x3, x6\n\t" - "adcs x4, x4, x8\n\t" - "cset x8, cs\n\t" - "adds x13, x13, x3\n\t" - "adcs x14, x14, x4\n\t" - "adc x8, x8, xzr\n\t" - "# mu = a[2] * mp\n\t" - "mul x5, %[mp], x11\n\t" - "# a[2+0] += m[0] * mu\n\t" - "mul x3, x17, x5\n\t" - "umulh x6, x17, x5\n\t" - "adds x11, x11, x3\n\t" - "adc x6, x6, xzr\n\t" - "# a[2+1] += m[1] * mu\n\t" - "mul x3, x19, x5\n\t" - "umulh x7, x19, x5\n\t" - "adds x3, x3, x6\n\t" - "adc x7, x7, xzr\n\t" - "adds x12, x12, x3\n\t" - "adc x7, x7, xzr\n\t" - "# a[2+2] += m[2] * mu\n\t" - "mul x3, x20, x5\n\t" - "umulh x6, x20, x5\n\t" - "adds x3, x3, x7\n\t" - "adc x6, x6, xzr\n\t" - "adds x13, x13, x3\n\t" - "adc x6, x6, xzr\n\t" - "# a[2+3] += m[3] * mu\n\t" - "mul x3, x21, x5\n\t" - "umulh x4, x21, x5\n\t" - "adds x3, x3, x6\n\t" - "adcs x4, x4, x8\n\t" - "cset x8, cs\n\t" - "adds x14, x14, x3\n\t" - "adcs x15, x15, x4\n\t" - "adc x8, x8, xzr\n\t" - "# mu = a[3] * mp\n\t" - "mul x5, %[mp], x12\n\t" - "# a[3+0] += m[0] * mu\n\t" - "mul x3, x17, x5\n\t" - "umulh x6, x17, x5\n\t" - "adds x12, x12, x3\n\t" - "adc x6, x6, xzr\n\t" - "# a[3+1] += m[1] * mu\n\t" - "mul x3, x19, x5\n\t" - "umulh x7, x19, x5\n\t" - "adds x3, x3, x6\n\t" - "adc x7, x7, xzr\n\t" - "adds x13, x13, x3\n\t" - "adc x7, x7, xzr\n\t" - "# a[3+2] += m[2] * mu\n\t" - "mul x3, x20, x5\n\t" - "umulh x6, x20, x5\n\t" - "adds x3, x3, x7\n\t" - "adc x6, 
x6, xzr\n\t" - "adds x14, x14, x3\n\t" - "adc x6, x6, xzr\n\t" - "# a[3+3] += m[3] * mu\n\t" - "mul x3, x21, x5\n\t" - "umulh x4, x21, x5\n\t" - "adds x3, x3, x6\n\t" - "adcs x4, x4, x8\n\t" - "cset x8, cs\n\t" - "adds x15, x15, x3\n\t" - "adcs x16, x16, x4\n\t" - "adc x8, x8, xzr\n\t" - "sub x3, xzr, x8\n\t" - "and x17, x17, x3\n\t" - "and x19, x19, x3\n\t" - "and x20, x20, x3\n\t" - "and x21, x21, x3\n\t" - "subs x13, x13, x17\n\t" - "sbcs x14, x14, x19\n\t" - "sbcs x15, x15, x20\n\t" - "stp x13, x14, [%[a], 0]\n\t" - "sbc x16, x16, x21\n\t" - "stp x15, x16, [%[a], 16]\n\t" - : - : [a] "r" (a), [m] "r" (m), [mp] "r" (mp) - : "memory", "x3", "x4", "x5", "x8", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "cc" - ); -} - /* Map the Montgomery form projective coordinate point to an affine point. * * r Resulting affine coordinate point. @@ -23252,41 +23074,6 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p, r->z[0] = 1; } -/* Add two Montgomery form numbers (r = a + b % m). - * - * r Result of addition. - * a First number to add in Montgomery form. - * b Second number to add in Montgomery form. - * m Modulus (prime). - */ -static void sp_256_mont_add_4(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) -{ - __asm__ __volatile__ ( - "ldp x4, x5, [%[a], 0]\n\t" - "ldp x8, x9, [%[b], 0]\n\t" - "adds x4, x4, x8\n\t" - "ldp x6, x7, [%[a], 16]\n\t" - "adcs x5, x5, x9\n\t" - "ldp x10, x11, [%[b], 16]\n\t" - "adcs x6, x6, x10\n\t" - "adcs x7, x7, x11\n\t" - "mov x13, 0xffffffff00000001\n\t" - "csetm x14, cs\n\t" - "subs x4, x4, x14\n\t" - "lsr x12, x14, 32\n\t" - "sbcs x5, x5, x12\n\t" - "and x13, x13, x14\n\t" - "sbcs x6, x6, xzr\n\t" - "stp x4, x5, [%[r],0]\n\t" - "sbc x7, x7, x13\n\t" - "stp x6, x7, [%[r],16]\n\t" - : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "cc" - ); -} - /* Double a Montgomery form number (r = a + a % m). * * r Result of doubling. 
@@ -23298,23 +23085,30 @@ static void sp_256_mont_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* m) __asm__ __volatile__ ( "ldp x3, x4, [%[a]]\n\t" "ldp x5, x6, [%[a],16]\n\t" - "adds x3, x3, x3\n\t" - "adcs x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "mov x8, 0xffffffff00000001\n\t" - "csetm x9, cs\n\t" - "subs x3, x3, x9\n\t" - "lsr x7, x9, 32\n\t" - "sbcs x4, x4, x7\n\t" - "and x8, x8, x9\n\t" - "sbcs x5, x5, xzr\n\t" - "stp x3, x4, [%[r],0]\n\t" - "sbc x6, x6, x8\n\t" - "stp x5, x6, [%[r],16]\n\t" + "lsl x9, x3, #1\n\t" + "extr x10, x4, x3, #63\n\t" + "extr x11, x5, x4, #63\n\t" + "asr x13, x6, #63\n\t" + "extr x12, x6, x5, #63\n\t" + "subs x9, x9, x13\n\t" + "lsr x7, x13, 32\n\t" + "sbcs x10, x10, x7\n\t" + "sub x8, xzr, x7\n\t" + "sbcs x11, x11, xzr\n\t" + "sbcs x12, x12, x8\n\t" + "sbc x8, xzr, xzr\n\t" + "sub x13, x13, x8\n\t" + "subs x9, x9, x13\n\t" + "lsr x7, x13, 32\n\t" + "sbcs x10, x10, x7\n\t" + "sub x8, xzr, x7\n\t" + "sbcs x11, x11, xzr\n\t" + "stp x9, x10, [%[r],0]\n\t" + "sbc x12, x12, x8\n\t" + "stp x11, x12, [%[r],16]\n\t" : : [r] "r" (r), [a] "r" (a) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc" + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x13", "cc" ); (void)m; @@ -23329,38 +23123,46 @@ static void sp_256_mont_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* m) static void sp_256_mont_tpl_4(sp_digit* r, const sp_digit* a, const sp_digit* m) { __asm__ __volatile__ ( - "ldp x10, x11, [%[a]]\n\t" - "adds x3, x10, x10\n\t" - "ldr x12, [%[a], 16]\n\t" - "adcs x4, x11, x11\n\t" - "ldr x13, [%[a], 24]\n\t" - "adcs x5, x12, x12\n\t" - "adcs x6, x13, x13\n\t" - "mov x8, 0xffffffff00000001\n\t" - "csetm x9, cs\n\t" - "subs x3, x3, x9\n\t" - "lsr x7, x9, 32\n\t" + "ldp x9, x10, [%[a]]\n\t" + "ldp x11, x12, [%[a], 16]\n\t" + "lsl x3, x9, #1\n\t" + "extr x4, x10, x9, #63\n\t" + "extr x5, x11, x10, #63\n\t" + "asr x13, x12, #63\n\t" + "extr x6, x12, x11, #63\n\t" + "subs x3, x3, x13\n\t" + "lsr x7, x13, 32\n\t" "sbcs x4, x4, x7\n\t" - "and x8, x8, x9\n\t" + "sub x8, xzr, x7\n\t" "sbcs x5, x5, xzr\n\t" - "sbc x6, x6, x8\n\t" - "adds x3, x3, x10\n\t" - "adcs x4, x4, x11\n\t" - "adcs x5, x5, x12\n\t" - "adcs x6, x6, x13\n\t" - "mov x8, 0xffffffff00000001\n\t" - "csetm x9, cs\n\t" - "subs x3, x3, x9\n\t" - "lsr x7, x9, 32\n\t" + "sbcs x6, x6, x8\n\t" + "neg x13, x13\n\t" + "sbc x13, x13, xzr\n\t" + "adds x3, x3, x9\n\t" + "adcs x4, x4, x10\n\t" + "adcs x5, x5, x11\n\t" + "adcs x6, x6, x12\n\t" + "adc x13, x13, xzr\n\t" + "neg x13, x13\n\t" + "subs x3, x3, x13, asr #1\n\t" + "lsr x7, x13, 32\n\t" "sbcs x4, x4, x7\n\t" - "and x8, x8, x9\n\t" + "sub x8, xzr, x7\n\t" + "sbcs x5, x5, xzr\n\t" + "sbcs x6, x6, x8\n\t" + "sbc x8, xzr, xzr\n\t" + "sub x13, x13, x8\n\t" + "subs x3, x3, x13\n\t" + "lsr x7, x13, 32\n\t" + "sbcs x4, x4, x7\n\t" + "sub x8, xzr, x7\n\t" "sbcs x5, x5, xzr\n\t" "stp x3, x4, [%[r], 0]\n\t" "sbc x6, x6, x8\n\t" "stp x5, x6, [%[r], 16]\n\t" : : [r] "r" (r), [a] "r" (a) - : "memory", "x10", "x11", "x12", "x13", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc" + : "memory", "x9", "x10", "x11", "x12", "x3", "x4", "x5", "x6", "x7", "x8", "x13", "cc" ); (void)m; @@ -23378,30 +23180,37 @@ static void sp_256_mont_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, { __asm__ __volatile__ ( "ldp x4, x5, [%[a], 0]\n\t" - "ldp x8, x9, [%[b], 0]\n\t" - "subs x4, x4, x8\n\t" "ldp x6, x7, [%[a], 16]\n\t" - "sbcs x5, x5, x9\n\t" + "ldp x8, x9, [%[b], 0]\n\t" "ldp x10, x11, [%[b], 16]\n\t" + "subs x4, x4, x8\n\t" + "sbcs x5, x5, 
x9\n\t" "sbcs x6, x6, x10\n\t" "sbcs x7, x7, x11\n\t" - "mov x13, 0xffffffff00000001\n\t" - "csetm x14, cc\n\t" + "sbc x14, xzr, xzr\n\t" "adds x4, x4, x14\n\t" "lsr x12, x14, 32\n\t" "adcs x5, x5, x12\n\t" - "and x13, x13, x14\n\t" + "sub x13, xzr, x12\n\t" + "adcs x6, x6, xzr\n\t" + "adcs x7, x7, x13\n\t" + "adc x14, x14, xzr\n\t" + "adds x4, x4, x14\n\t" + "lsr x12, x14, 32\n\t" + "adcs x5, x5, x12\n\t" + "sub x13, xzr, x12\n\t" "adcs x6, x6, xzr\n\t" "stp x4, x5, [%[r],0]\n\t" "adc x7, x7, x13\n\t" "stp x6, x7, [%[r],16]\n\t" : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "cc" ); + + (void)m; } -#define sp_256_mont_sub_lower_4 sp_256_mont_sub_4 /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * * r Result of division by 2. @@ -23412,27 +23221,166 @@ static void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m) { __asm__ __volatile__ ( "ldp x3, x4, [%[a], 0]\n\t" - "and x9, x3, 1\n\t" "ldp x5, x6, [%[a], 16]\n\t" - "sub x10, xzr, x9\n\t" - "lsr x7, x10, 32\n\t" - "adds x3, x3, x10\n\t" - "and x8, x10, 0xffffffff00000001\n\t" + "sbfx x8, x3, 0, 1\n\t" + "adds x3, x3, x8\n\t" + "lsr x7, x8, 32\n\t" "adcs x4, x4, x7\n\t" + "sub x8, xzr, x7\n\t" "adcs x5, x5, xzr\n\t" "extr x3, x4, x3, 1\n\t" "adcs x6, x6, x8\n\t" "extr x4, x5, x4, 1\n\t" - "cset x9, cs\n\t" + "adc x9, xzr, xzr\n\t" "extr x5, x6, x5, 1\n\t" "extr x6, x9, x6, 1\n\t" "stp x3, x4, [%[r], 0]\n\t" "stp x5, x6, [%[r], 16]\n\t" : : [r] "r" (r), [a] "r" (a), [m] "r" (m) - : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" + : "memory", "x3", "x4", "x5", "x6", "x7", "x9", "x8", "cc" + ); +} + +/* Double number and subtract (r = (a - 2.b) % m). + * + * r Result of subtration. + * a Number to subtract from in Montgomery form. + * b Number to subtract with in Montgomery form. + * m Modulus (prime). 
+ */ +static void sp_256_mont_rsb_sub_dbl_4(sp_digit* r, const sp_digit* a, + sp_digit* b, const sp_digit* m) +{ + __asm__ __volatile__ ( + "ldp x8, x9, [%[b]]\n\t" + "ldp x10, x11, [%[b],16]\n\t" + "lsl x15, x8, #1\n\t" + "extr x16, x9, x8, #63\n\t" + "extr x17, x10, x9, #63\n\t" + "asr x14, x11, #63\n\t" + "extr x19, x11, x10, #63\n\t" + "ldp x4, x5, [%[a]]\n\t" + "ldp x6, x7, [%[a],16]\n\t" + "subs x15, x15, x14\n\t" + "lsr x12, x14, 32\n\t" + "sbcs x16, x16, x12\n\t" + "sub x13, xzr, x12\n\t" + "sbcs x17, x17, xzr\n\t" + "sbcs x19, x19, x13\n\t" + "neg x14, x14\n\t" + "sbc x14, x14, xzr\n\t" + "subs x15, x4, x15\n\t" + "sbcs x16, x5, x16\n\t" + "sbcs x17, x6, x17\n\t" + "sbcs x19, x7, x19\n\t" + "sbc x14, xzr, x14\n\t" + "adds x15, x15, x14, asr #1\n\t" + "lsr x12, x14, 32\n\t" + "adcs x16, x16, x12\n\t" + "sub x13, xzr, x12\n\t" + "adcs x17, x17, xzr\n\t" + "adcs x19, x19, x13\n\t" + "adc x14, x14, xzr\n\t" + "adds x15, x15, x14\n\t" + "lsr x12, x14, 32\n\t" + "adcs x16, x16, x12\n\t" + "sub x13, xzr, x12\n\t" + "adcs x17, x17, xzr\n\t" + "stp x15, x16, [%[r],0]\n\t" + "adc x19, x19, x13\n\t" + "stp x17, x19, [%[r],16]\n\t" + "subs x15, x8, x15\n\t" + "sbcs x16, x9, x16\n\t" + "sbcs x17, x10, x17\n\t" + "sbcs x19, x11, x19\n\t" + "sbc x14, xzr, xzr\n\t" + "adds x15, x15, x14\n\t" + "lsr x12, x14, 32\n\t" + "adcs x16, x16, x12\n\t" + "sub x13, xzr, x12\n\t" + "adcs x17, x17, xzr\n\t" + "adcs x19, x19, x13\n\t" + "adc x14, x14, xzr\n\t" + "adds x15, x15, x14\n\t" + "lsr x12, x14, 32\n\t" + "adcs x16, x16, x12\n\t" + "sub x13, xzr, x12\n\t" + "adcs x17, x17, xzr\n\t" + "stp x15, x16, [%[b],0]\n\t" + "adc x19, x19, x13\n\t" + "stp x17, x19, [%[b],16]\n\t" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "cc" ); + (void)m; +} + +/* Subtract two Montgomery form numbers (r = a - b % m). + * + * ra Result of addition. + * rs Result of subtration. + * a Number to subtract from in Montgomery form. + * b Number to subtract with in Montgomery form. + * m Modulus (prime). 
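+ *
+ * Note: both results are produced in one pass: ra = (a + b) mod m and
+ *       rs = (a - b) mod m.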
+ */ +static void sp_256_mont_add_sub_4(sp_digit* ra, sp_digit* rs, const sp_digit* a, + const sp_digit* b, const sp_digit* m) +{ + __asm__ __volatile__ ( + "ldp x4, x5, [%[a], 0]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "adds x14, x4, x8\n\t" + "adcs x15, x5, x9\n\t" + "adcs x16, x6, x10\n\t" + "adcs x17, x7, x11\n\t" + "csetm x19, cs\n\t" + "subs x14, x14, x19\n\t" + "lsr x12, x19, 32\n\t" + "sbcs x15, x15, x12\n\t" + "sub x13, xzr, x12\n\t" + "sbcs x16, x16, xzr\n\t" + "sbcs x17, x17, x13\n\t" + "sbc x13, xzr, xzr\n\t" + "sub x19, x19, x13\n\t" + "subs x14, x14, x19\n\t" + "lsr x12, x19, 32\n\t" + "sbcs x15, x15, x12\n\t" + "sub x13, xzr, x12\n\t" + "sbcs x16, x16, xzr\n\t" + "stp x14, x15, [%[ra],0]\n\t" + "sbc x17, x17, x13\n\t" + "stp x16, x17, [%[ra],16]\n\t" + "subs x4, x4, x8\n\t" + "sbcs x5, x5, x9\n\t" + "sbcs x6, x6, x10\n\t" + "sbcs x7, x7, x11\n\t" + "sbc x19, xzr, xzr\n\t" + "adds x4, x4, x19\n\t" + "lsr x12, x19, 32\n\t" + "adcs x5, x5, x12\n\t" + "sub x13, xzr, x12\n\t" + "adcs x6, x6, xzr\n\t" + "adcs x7, x7, x13\n\t" + "adc x19, x19, xzr\n\t" + "adds x4, x4, x19\n\t" + "lsr x12, x19, 32\n\t" + "adcs x5, x5, x12\n\t" + "sub x13, xzr, x12\n\t" + "adcs x6, x6, xzr\n\t" + "stp x4, x5, [%[rs],0]\n\t" + "adc x7, x7, x13\n\t" + "stp x6, x7, [%[rs],16]\n\t" + : + : [ra] "r" (ra), [rs] "r" (rs), [a] "r" (a), [b] "r" (b) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x19", "x14", "x15", "x16", "x17", "cc" + ); + + (void)m; } /* Double the Montgomery form projective point p. @@ -23464,10 +23412,8 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p, sp_256_mont_mul_4(z, p->y, p->z, p256_mod, p256_mp_mod); /* Z = 2Z */ sp_256_mont_dbl_4(z, z, p256_mod); - /* T2 = X - T1 */ - sp_256_mont_sub_4(t2, p->x, t1, p256_mod); - /* T1 = X + T1 */ - sp_256_mont_add_4(t1, p->x, t1, p256_mod); + /* T1/T2 = X +/- T1 */ + sp_256_mont_add_sub_4(t1, t2, p->x, t1, p256_mod); /* T2 = T1 * T2 */ sp_256_mont_mul_4(t2, t1, t2, p256_mod, p256_mp_mod); /* T1 = 3T2 */ @@ -23484,12 +23430,9 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p, sp_256_mont_mul_4(y, y, p->x, p256_mod, p256_mp_mod); /* X = T1 * T1 */ sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod); - /* X = X - Y */ - sp_256_mont_sub_4(x, x, y, p256_mod); - /* X = X - Y */ - sp_256_mont_sub_4(x, x, y, p256_mod); + /* X = X - 2*Y */ /* Y = Y - X */ - sp_256_mont_sub_lower_4(y, y, x, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -23550,13 +23493,11 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con ctx->state = 4; break; case 4: - /* T2 = X - T1 */ - sp_256_mont_sub_4(ctx->t2, p->x, ctx->t1, p256_mod); + /* T1/T2 = X +/- T1 */ + sp_256_mont_add_sub_4(ctx->t1, ctx->t2, p->x, ctx->t1, p256_mod); ctx->state = 5; break; case 5: - /* T1 = X + T1 */ - sp_256_mont_add_4(ctx->t1, p->x, ctx->t1, p256_mod); ctx->state = 6; break; case 6: @@ -23600,18 +23541,15 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con ctx->state = 14; break; case 14: - /* X = X - Y */ - sp_256_mont_sub_4(ctx->x, ctx->x, ctx->y, p256_mod); + /* X = X - 2*Y */ + /* Y = Y - X */ + sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod); ctx->state = 15; break; case 15: - /* X = X - Y */ - sp_256_mont_sub_4(ctx->x, ctx->x, ctx->y, p256_mod); ctx->state = 16; break; case 16: - 
/* Y = Y - X */ - sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -23636,101 +23574,6 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_256_mont_tpl_lower_4 sp_256_mont_tpl_4 -/* Subtract two Montgomery form numbers (r = a - b % m). - * - * r Result of subtration. - * a Number to subtract from in Montgomery form. - * b Number to subtract with in Montgomery form. - * m Modulus (prime). - */ -static void sp_256_mont_sub_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) -{ - __asm__ __volatile__ ( - "ldp x8, x9, [%[b]]\n\t" - "ldp x10, x11, [%[b],16]\n\t" - "adds x8, x8, x8\n\t" - "ldp x4, x5, [%[a]]\n\t" - "adcs x9, x9, x9\n\t" - "ldp x6, x7, [%[a],16]\n\t" - "adcs x10, x10, x10\n\t" - "adcs x11, x11, x11\n\t" - "mov x13, 0xffffffff00000001\n\t" - "csetm x14, cs\n\t" - "subs x8, x8, x14\n\t" - "lsr x12, x14, 32\n\t" - "sbcs x9, x9, x12\n\t" - "and x13, x13, x14\n\t" - "sbcs x10, x10, xzr\n\t" - "sbc x11, x11, x13\n\t" - "subs x4, x4, x8\n\t" - "sbcs x5, x5, x9\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x13, 0xffffffff00000001\n\t" - "csetm x14, cc\n\t" - "adds x4, x4, x14\n\t" - "lsr x12, x14, 32\n\t" - "adcs x5, x5, x12\n\t" - "and x13, x13, x14\n\t" - "adcs x6, x6, xzr\n\t" - "stp x4, x5, [%[r],0]\n\t" - "adc x7, x7, x13\n\t" - "stp x6, x7, [%[r],16]\n\t" - : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "cc" - ); -} - -/* Subtract two Montgomery form numbers (r = a - b % m). - * - * r Result of subtration. - * a Number to subtract from in Montgomery form. - * b Number to subtract with in Montgomery form. - * m Modulus (prime). - */ -static void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) -{ - __asm__ __volatile__ ( - "ldp x4, x5, [%[a], 0]\n\t" - "ldp x8, x9, [%[b], 0]\n\t" - "subs x4, x4, x8\n\t" - "ldp x6, x7, [%[a], 16]\n\t" - "sbcs x5, x5, x9\n\t" - "ldp x10, x11, [%[b], 16]\n\t" - "sbcs x6, x6, x10\n\t" - "sbcs x7, x7, x11\n\t" - "mov x13, 0xffffffff00000001\n\t" - "csetm x14, cc\n\t" - "adds x4, x4, x14\n\t" - "lsr x12, x14, 32\n\t" - "adcs x5, x5, x12\n\t" - "and x13, x13, x14\n\t" - "adcs x6, x6, xzr\n\t" - "adc x7, x7, x13\n\t" - "adds x4, x4, x4\n\t" - "adcs x5, x5, x5\n\t" - "adcs x6, x6, x6\n\t" - "adcs x7, x7, x7\n\t" - "mov x13, 0xffffffff00000001\n\t" - "csetm x14, cs\n\t" - "subs x4, x4, x14\n\t" - "lsr x12, x14, 32\n\t" - "sbcs x5, x5, x12\n\t" - "and x13, x13, x14\n\t" - "sbcs x6, x6, xzr\n\t" - "stp x4, x5, [%[r],0]\n\t" - "sbc x7, x7, x13\n\t" - "stp x6, x7, [%[r],16]\n\t" - : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "cc" - ); -} - /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
@@ -23768,15 +23611,15 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_4(a, t1, p256_mod); + sp_256_mont_tpl_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_dbl_sub_4(b, b, x, p256_mod); + sp_256_mont_dbl_4(b, b, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -23796,15 +23639,15 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_4(a, t1, p256_mod); + sp_256_mont_tpl_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); /* X = A^2 - 2B */ sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_dbl_sub_4(b, b, x, p256_mod); + sp_256_mont_dbl_4(b, b, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -23852,12 +23695,12 @@ static int sp_256_iszero_4(const sp_digit* a) static void sp_256_proj_point_add_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*4; - sp_digit* t3 = t + 4*4; - sp_digit* t4 = t + 6*4; - sp_digit* t5 = t + 8*4; - sp_digit* t6 = t + 10*4; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*4; + sp_digit* t2 = t + 4*4; + sp_digit* t3 = t + 6*4; + sp_digit* t4 = t + 8*4; + sp_digit* t5 = t + 10*4; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod); @@ -23879,17 +23722,9 @@ static void sp_256_proj_point_add_4(sp_point_256* r, sp_256_proj_point_dbl_4(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_256_mont_sub_4(t2, t2, t1, p256_mod); @@ -23905,22 +23740,74 @@ static void sp_256_proj_point_add_4(sp_point_256* r, sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_4(x, x, t5, p256_mod); sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_4(x, x, y, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_4(y, y, x, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, x, y, p256_mod); sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_4(y, y, t5, p256_mod); - for (i = 0; i < 4; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; +{ + __asm__ __volatile__ ( + "ldrsw x10, [%[p], #192]\n\t" + "ldrsw x11, [%[q], #192]\n\t" + "ldp x12, x13, [%[x], #0]\n\t" + "ldp x14, x15, [%[x], #16]\n\t" + "ldp x16, x17, [%[y], #0]\n\t" + "ldp x19, x20, [%[y], #16]\n\t" + 
"ldp x21, x22, [%[z], #0]\n\t" + "ldp x23, x24, [%[z], #16]\n\t" + "bics xzr, x11, x10\n\t" + "ldp x25, x26, [%[p], #0]\n\t" + "ldp x27, x28, [%[p], #16]\n\t" + "csel x12, x12, x25, eq\n\t" + "csel x13, x13, x26, eq\n\t" + "csel x14, x14, x27, eq\n\t" + "csel x15, x15, x28, eq\n\t" + "ldp x25, x26, [%[p], #64]\n\t" + "ldp x27, x28, [%[p], #80]\n\t" + "csel x16, x16, x25, eq\n\t" + "csel x17, x17, x26, eq\n\t" + "csel x19, x19, x27, eq\n\t" + "csel x20, x20, x28, eq\n\t" + "ldp x25, x26, [%[p], #128]\n\t" + "ldp x27, x28, [%[p], #144]\n\t" + "csel x21, x21, x25, eq\n\t" + "csel x22, x22, x26, eq\n\t" + "csel x23, x23, x27, eq\n\t" + "csel x24, x24, x28, eq\n\t" + "bics xzr, x10, x11\n\t" + "and x10, x10, x11\n\t" + "ldp x25, x26, [%[q], #0]\n\t" + "ldp x27, x28, [%[q], #16]\n\t" + "csel x12, x12, x25, eq\n\t" + "csel x13, x13, x26, eq\n\t" + "csel x14, x14, x27, eq\n\t" + "csel x15, x15, x28, eq\n\t" + "ldp x25, x26, [%[q], #64]\n\t" + "ldp x27, x28, [%[q], #80]\n\t" + "csel x16, x16, x25, eq\n\t" + "csel x17, x17, x26, eq\n\t" + "csel x19, x19, x27, eq\n\t" + "csel x20, x20, x28, eq\n\t" + "ldp x25, x26, [%[q], #128]\n\t" + "ldp x27, x28, [%[q], #144]\n\t" + "csel x21, x21, x25, eq\n\t" + "csel x22, x22, x26, eq\n\t" + "csel x23, x23, x27, eq\n\t" + "csel x24, x24, x28, eq\n\t" + "orr x21, x21, x10\n\t" + "stp x12, x13, [%[r], #0]\n\t" + "stp x14, x15, [%[r], #16]\n\t" + "stp x16, x17, [%[r], #64]\n\t" + "stp x19, x20, [%[r], #80]\n\t" + "stp x21, x22, [%[r], #128]\n\t" + "stp x23, x24, [%[r], #144]\n\t" + "str w10, [%[r], #192]\n\t" + : + : [r] "r" (r), [p] "r" (p), [q] "r" (q), [x] "r" (x), + [y] "r" (y), [z] "r" (z) + : "memory", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", + "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} } } @@ -23966,12 +23853,12 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*4; - ctx->t3 = t + 4*4; - ctx->t4 = t + 6*4; - ctx->t5 = t + 8*4; - ctx->t6 = t + 10*4; + ctx->t6 = t; + ctx->t1 = t + 2*4; + ctx->t2 = t + 4*4; + ctx->t3 = t + 6*4; + ctx->t4 = t + 8*4; + ctx->t5 = t + 10*4; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -24072,12 +23959,11 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, ctx->state = 20; break; case 20: - sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod); + /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ + sp_256_mont_rsb_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod); ctx->state = 21; break; case 21: - /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 22; break; case 22: @@ -24090,22 +23976,70 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); - - for (i = 0; i < 4; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; +{ + __asm__ __volatile__ ( + "ldrsw x10, [%[p], #192]\n\t" + "ldrsw x11, [%[q], #192]\n\t" + "ldp x12, x13, [%[x], #0]\n\t" + "ldp x14, x15, [%[x], #16]\n\t" + "ldp x16, x17, 
[%[y], #0]\n\t" + "ldp x19, x20, [%[y], #16]\n\t" + "ldp x21, x22, [%[z], #0]\n\t" + "ldp x23, x24, [%[z], #16]\n\t" + "bics xzr, x11, x10\n\t" + "ldp x25, x26, [%[p], #0]\n\t" + "ldp x27, x28, [%[p], #16]\n\t" + "csel x12, x12, x25, eq\n\t" + "csel x13, x13, x26, eq\n\t" + "csel x14, x14, x27, eq\n\t" + "csel x15, x15, x28, eq\n\t" + "ldp x25, x26, [%[p], #64]\n\t" + "ldp x27, x28, [%[p], #80]\n\t" + "csel x16, x16, x25, eq\n\t" + "csel x17, x17, x26, eq\n\t" + "csel x19, x19, x27, eq\n\t" + "csel x20, x20, x28, eq\n\t" + "ldp x25, x26, [%[p], #128]\n\t" + "ldp x27, x28, [%[p], #144]\n\t" + "csel x21, x21, x25, eq\n\t" + "csel x22, x22, x26, eq\n\t" + "csel x23, x23, x27, eq\n\t" + "csel x24, x24, x28, eq\n\t" + "bics xzr, x10, x11\n\t" + "and x10, x10, x11\n\t" + "ldp x25, x26, [%[q], #0]\n\t" + "ldp x27, x28, [%[q], #16]\n\t" + "csel x12, x12, x25, eq\n\t" + "csel x13, x13, x26, eq\n\t" + "csel x14, x14, x27, eq\n\t" + "csel x15, x15, x28, eq\n\t" + "ldp x25, x26, [%[q], #64]\n\t" + "ldp x27, x28, [%[q], #80]\n\t" + "csel x16, x16, x25, eq\n\t" + "csel x17, x17, x26, eq\n\t" + "csel x19, x19, x27, eq\n\t" + "csel x20, x20, x28, eq\n\t" + "ldp x25, x26, [%[q], #128]\n\t" + "ldp x27, x28, [%[q], #144]\n\t" + "csel x21, x21, x25, eq\n\t" + "csel x22, x22, x26, eq\n\t" + "csel x23, x23, x27, eq\n\t" + "csel x24, x24, x28, eq\n\t" + "orr x21, x21, x10\n\t" + "stp x12, x13, [%[r], #0]\n\t" + "stp x14, x15, [%[r], #16]\n\t" + "stp x16, x17, [%[r], #64]\n\t" + "stp x19, x20, [%[r], #80]\n\t" + "stp x21, x22, [%[r], #128]\n\t" + "stp x23, x24, [%[r], #144]\n\t" + "str w10, [%[r], #192]\n\t" + : + : [r] "r" (r), [p] "r" (p), [q] "r" (q), [x] "r" (ctx->x), + [y] "r" (ctx->y), [z] "r" (ctx->z) + : "memory", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", + "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} ctx->state = 25; break; } @@ -24163,16 +24097,16 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_4(a, t1, p256_mod); + sp_256_mont_tpl_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); x = r[j].x; /* X = A^2 - 2B */ sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod); - sp_256_mont_sub_dbl_4(x, x, b, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, x, b, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_dbl_sub_4(b, b, x, p256_mod); + sp_256_mont_dbl_4(b, b, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_4(r[j].z, z, y, p256_mod, p256_mp_mod); z = r[j].z; @@ -24237,10 +24171,8 @@ static void sp_256_proj_point_add_sub_4(sp_point_256* ra, sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod); /* H = U2 - U1 */ sp_256_mont_sub_4(t2, t2, t1, p256_mod); - /* RS = S2 + S1 */ - sp_256_mont_add_4(t6, t4, t3, p256_mod); - /* R = S2 - S1 */ - sp_256_mont_sub_4(t4, t4, t3, p256_mod); + /* RS/R = S2 +/ S1 */ + sp_256_mont_add_sub_4(t6, t4, t4, t3, p256_mod); /* Z3 = H*Z1*Z2 */ /* ZS = H*Z1*Z2 */ sp_256_mont_mul_4(za, za, q->z, p256_mod, p256_mp_mod); @@ -24260,8 +24192,8 @@ static void sp_256_proj_point_add_sub_4(sp_point_256* ra, sp_256_mont_sub_4(xs, xs, t1, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_256_mont_sub_lower_4(ys, ya, xs, p256_mod); - sp_256_mont_sub_lower_4(ya, ya, xa, p256_mod); + sp_256_mont_sub_4(ys, ya, xs, p256_mod); + sp_256_mont_sub_4(ya, ya, xa, p256_mod); sp_256_mont_mul_4(ya, ya, t4, p256_mod, 
p256_mp_mod); sp_256_sub_4(t6, p256_mod, t6); sp_256_mont_mul_4(ys, ys, t6, p256_mod, p256_mp_mod); @@ -24351,36 +24283,63 @@ static void sp_256_ecc_recode_6_4(const sp_digit* k, ecc_recode_256* v) static void sp_256_get_point_33_4(sp_point_256* r, const sp_point_256* table, int idx) { - int i; - sp_digit mask; - - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - r->z[0] = 0; - r->z[1] = 0; - r->z[2] = 0; - r->z[3] = 0; - for (i = 1; i < 33; i++) { - mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - r->z[0] |= mask & table[i].z[0]; - r->z[1] |= mask & table[i].z[1]; - r->z[2] |= mask & table[i].z[2]; - r->z[3] |= mask & table[i].z[3]; - } + __asm__ __volatile__ ( + "mov w30, #1\n\t" + "add %[table], %[table], #200\n\t" + "cmp %w[idx], w30\n\t" + "add w30, w30, #1\n\t" + "ldp x15, x16, [%[table], #0]\n\t" + "ldp x17, x19, [%[table], #16]\n\t" + "csel x3, xzr, x15, ne\n\t" + "csel x4, xzr, x16, ne\n\t" + "csel x5, xzr, x17, ne\n\t" + "csel x6, xzr, x19, ne\n\t" + "ldp x15, x16, [%[table], #64]\n\t" + "ldp x17, x19, [%[table], #80]\n\t" + "csel x7, xzr, x15, ne\n\t" + "csel x8, xzr, x16, ne\n\t" + "csel x9, xzr, x17, ne\n\t" + "csel x10, xzr, x19, ne\n\t" + "ldp x15, x16, [%[table], #128]\n\t" + "ldp x17, x19, [%[table], #144]\n\t" + "csel x11, xzr, x15, ne\n\t" + "csel x12, xzr, x16, ne\n\t" + "csel x13, xzr, x17, ne\n\t" + "csel x14, xzr, x19, ne\n\t" + "1:\n\t" + "add %[table], %[table], #200\n\t" + "cmp %w[idx], w30\n\t" + "add w30, w30, #1\n\t" + "ldp x15, x16, [%[table], #0]\n\t" + "ldp x17, x19, [%[table], #16]\n\t" + "csel x3, x3, x15, ne\n\t" + "csel x4, x4, x16, ne\n\t" + "csel x5, x5, x17, ne\n\t" + "csel x6, x6, x19, ne\n\t" + "ldp x15, x16, [%[table], #64]\n\t" + "ldp x17, x19, [%[table], #80]\n\t" + "csel x7, x7, x15, ne\n\t" + "csel x8, x8, x16, ne\n\t" + "csel x9, x9, x17, ne\n\t" + "csel x10, x10, x19, ne\n\t" + "ldp x15, x16, [%[table], #128]\n\t" + "ldp x17, x19, [%[table], #144]\n\t" + "csel x11, x11, x15, ne\n\t" + "csel x12, x12, x16, ne\n\t" + "csel x13, x13, x17, ne\n\t" + "csel x14, x14, x19, ne\n\t" + "cmp w30, #33\n\t" + "b.ne 1b\n\t" + "stp x3, x4, [%[r], #0]\n\t" + "stp x5, x6, [%[r], #16]\n\t" + "stp x7, x8, [%[r], #64]\n\t" + "stp x9, x10, [%[r], #80]\n\t" + "stp x11, x12, [%[r], #128]\n\t" + "stp x13, x14, [%[r], #144]\n\t" + : [table] "+r" (table) + : [r] "r" (r), [idx] "r" (idx) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30" + ); } #endif /* !WC_NO_CACHE_RESISTANT */ /* Multiply the point by the scalar and return the result. @@ -24423,7 +24382,7 @@ static int sp_256_ecc_mulmod_win_add_sub_4(sp_point_256* r, const sp_point_256* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * + t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -24528,15 +24487,12 @@ static int sp_256_ecc_mulmod_win_add_sub_4(sp_point_256* r, const sp_point_256* return err; } -#ifndef WC_NO_CACHE_RESISTANT /* A table entry for pre-computed points. 
*/ typedef struct sp_table_entry_256 { sp_digit x[4]; sp_digit y[4]; } sp_table_entry_256; -#if defined(FP_ECC) || defined(WOLFSSL_SP_SMALL) -#endif /* FP_ECC | WOLFSSL_SP_SMALL */ /* Add two Montgomery form projective points. The second point has a q value of * one. * Only the first point can be the same pointer as the result point. @@ -24549,12 +24505,11 @@ typedef struct sp_table_entry_256 { static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*4; - sp_digit* t3 = t + 4*4; - sp_digit* t4 = t + 6*4; - sp_digit* t5 = t + 8*4; - sp_digit* t6 = t + 10*4; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*4; + sp_digit* t6 = t + 4*4; + sp_digit* t1 = t + 6*4; + sp_digit* t4 = t + 8*4; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -24570,13 +24525,9 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, sp_256_proj_point_dbl_4(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_256_mont_sub_4(t2, t2, p->x, p256_mod); @@ -24585,35 +24536,86 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, /* Z3 = H*Z1 */ sp_256_mont_mul_4(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_256_mont_sqr_4(t1, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t3, p->x, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_4(x, t1, t5, p256_mod); - sp_256_mont_sub_dbl_4(x, x, t3, p256_mod); + sp_256_mont_sqr_4(t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t3, p->x, t1, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(t2, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_4(t2, t2, t1, p256_mod); + sp_256_mont_rsb_sub_dbl_4(x, t2, t3, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_lower_4(t3, t3, x, p256_mod); sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t5, t5, p->y, p256_mod, p256_mp_mod); - sp_256_mont_sub_4(y, t3, t5, p256_mod); - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 4; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; + sp_256_mont_mul_4(t1, t1, p->y, p256_mod, p256_mp_mod); + sp_256_mont_sub_4(y, t3, t1, p256_mod); +{ + __asm__ __volatile__ ( + "ldrsw x10, [%[p], #192]\n\t" + "ldrsw x11, [%[q], #192]\n\t" + "ldp x12, x13, [%[x], #0]\n\t" + "ldp x14, x15, [%[x], #16]\n\t" + "ldp x16, x17, [%[y], #0]\n\t" + "ldp x19, x20, [%[y], #16]\n\t" + "ldp x21, x22, [%[z], #0]\n\t" + "ldp x23, x24, [%[z], #16]\n\t" + "bics xzr, x11, x10\n\t" + "ldp x25, x26, [%[p], #0]\n\t" + "ldp x27, x28, [%[p], #16]\n\t" + "csel x12, x12, x25, eq\n\t" + "csel x13, x13, x26, eq\n\t" + "csel x14, x14, x27, eq\n\t" + "csel x15, x15, x28, eq\n\t" + "ldp x25, x26, [%[p], #64]\n\t" + "ldp x27, x28, [%[p], #80]\n\t" + "csel x16, x16, x25, eq\n\t" + "csel x17, x17, x26, eq\n\t" + "csel x19, x19, x27, eq\n\t" + "csel x20, x20, x28, eq\n\t" + "ldp x25, x26, 
[%[p], #128]\n\t" + "ldp x27, x28, [%[p], #144]\n\t" + "csel x21, x21, x25, eq\n\t" + "csel x22, x22, x26, eq\n\t" + "csel x23, x23, x27, eq\n\t" + "csel x24, x24, x28, eq\n\t" + "bics xzr, x10, x11\n\t" + "and x10, x10, x11\n\t" + "ldp x25, x26, [%[q], #0]\n\t" + "ldp x27, x28, [%[q], #16]\n\t" + "csel x12, x12, x25, eq\n\t" + "csel x13, x13, x26, eq\n\t" + "csel x14, x14, x27, eq\n\t" + "csel x15, x15, x28, eq\n\t" + "ldp x25, x26, [%[q], #64]\n\t" + "ldp x27, x28, [%[q], #80]\n\t" + "csel x16, x16, x25, eq\n\t" + "csel x17, x17, x26, eq\n\t" + "csel x19, x19, x27, eq\n\t" + "csel x20, x20, x28, eq\n\t" + "ldp x25, x26, [%[q], #128]\n\t" + "ldp x27, x28, [%[q], #144]\n\t" + "csel x21, x21, x25, eq\n\t" + "csel x22, x22, x26, eq\n\t" + "csel x23, x23, x27, eq\n\t" + "csel x24, x24, x28, eq\n\t" + "orr x21, x21, x10\n\t" + "stp x12, x13, [%[r], #0]\n\t" + "stp x14, x15, [%[r], #16]\n\t" + "stp x16, x17, [%[r], #64]\n\t" + "stp x19, x20, [%[r], #80]\n\t" + "stp x21, x22, [%[r], #128]\n\t" + "stp x23, x24, [%[r], #144]\n\t" + "str w10, [%[r], #192]\n\t" + : + : [r] "r" (r), [p] "r" (p), [q] "r" (q), [x] "r" (x), + [y] "r" (y), [z] "r" (z) + : "memory", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", + "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} } } +#ifndef WC_NO_CACHE_RESISTANT +#if defined(FP_ECC) || defined(WOLFSSL_SP_SMALL) +#endif /* FP_ECC | WOLFSSL_SP_SMALL */ #ifdef FP_ECC /* Convert the projective point to affine. * Ordinates are in Montgomery form. @@ -24739,28 +24741,49 @@ static int sp_256_gen_stripe_table_4(const sp_point_256* a, static void sp_256_get_entry_64_4(sp_point_256* r, const sp_table_entry_256* table, int idx) { - int i; - sp_digit mask; - - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - for (i = 1; i < 64; i++) { - mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - } + __asm__ __volatile__ ( + "mov w30, #1\n\t" + "add %[table], %[table], #64\n\t" + "cmp %w[idx], w30\n\t" + "add w30, w30, #1\n\t" + "ldp x11, x12, [%[table], #0]\n\t" + "ldp x13, x14, [%[table], #16]\n\t" + "ldp x15, x16, [%[table], #32]\n\t" + "ldp x17, x19, [%[table], #48]\n\t" + "csel x3, xzr, x11, ne\n\t" + "csel x4, xzr, x12, ne\n\t" + "csel x5, xzr, x13, ne\n\t" + "csel x6, xzr, x14, ne\n\t" + "csel x7, xzr, x15, ne\n\t" + "csel x8, xzr, x16, ne\n\t" + "csel x9, xzr, x17, ne\n\t" + "csel x10, xzr, x19, ne\n\t" + "1:\n\t" + "add %[table], %[table], #64\n\t" + "cmp %w[idx], w30\n\t" + "add w30, w30, #1\n\t" + "ldp x11, x12, [%[table], #0]\n\t" + "ldp x13, x14, [%[table], #16]\n\t" + "ldp x15, x16, [%[table], #32]\n\t" + "ldp x17, x19, [%[table], #48]\n\t" + "csel x3, x3, x11, ne\n\t" + "csel x4, x4, x12, ne\n\t" + "csel x5, x5, x13, ne\n\t" + "csel x6, x6, x14, ne\n\t" + "csel x7, x7, x15, ne\n\t" + "csel x8, x8, x16, ne\n\t" + "csel x9, x9, x17, ne\n\t" + "csel x10, x10, x19, ne\n\t" + "cmp w30, #64\n\t" + "b.ne 1b\n\t" + "stp x3, x4, [%[r], #0]\n\t" + "stp x5, x6, [%[r], #16]\n\t" + "stp x7, x8, [%[r], #64]\n\t" + "stp x9, x10, [%[r], #80]\n\t" + : [table] "+r" (table) + : [r] "r" (r), [idx] "r" (idx) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30" + ); } 
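The csel loop above is the constant-time table selection that the removed C implementation performed with a mask-and-or loop: every entry of the table is read and only the one whose index equals idx is kept, so the memory access pattern does not depend on the (secret) index. The same pattern is used by the other get_point/get_entry lookups in this patch. A minimal C sketch of that pattern, modelled on the removed sp_256_get_entry_64_4 (the helper name get_entry_64_ct_sketch is illustrative only, not part of the patch):

static void get_entry_64_ct_sketch(sp_point_256* r,
    const sp_table_entry_256* table, int idx)
{
    int i;
    int j;
    sp_digit mask;

    /* Start from zero; if idx is 0 nothing is selected and r stays 0. */
    for (j = 0; j < 4; j++) {
        r->x[j] = 0;
        r->y[j] = 0;
    }
    /* Read every entry; keep only the one whose index matches idx. */
    for (i = 1; i < 64; i++) {
        mask = 0 - (sp_digit)(i == idx);   /* all ones iff i == idx */
        for (j = 0; j < 4; j++) {
            r->x[j] |= mask & table[i].x[j];
            r->y[j] |= mask & table[i].y[j];
        }
    }
}

The assembly keeps the accumulators in registers and replaces the mask-and-or with cmp/csel, but the data-independent access pattern is unchanged.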
#endif /* !WC_NO_CACHE_RESISTANT */ /* Multiply the point by the scalar and return the result. @@ -24788,7 +24811,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 4 * 6]; + sp_digit t[2 * 4 * 5]; #endif sp_point_256* p = NULL; int i; @@ -24809,7 +24832,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -24993,13 +25016,13 @@ static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_ #ifdef WOLFSSL_SP_SMALL_STACK sp_digit* tmp; #else - sp_digit tmp[2 * 4 * 6]; + sp_digit tmp[2 * 4 * 5]; #endif sp_cache_256_t* cache; int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; } @@ -25147,28 +25170,49 @@ static int sp_256_gen_stripe_table_4(const sp_point_256* a, static void sp_256_get_entry_256_4(sp_point_256* r, const sp_table_entry_256* table, int idx) { - int i; - sp_digit mask; - - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - for (i = 1; i < 256; i++) { - mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - } + __asm__ __volatile__ ( + "mov w30, #1\n\t" + "add %[table], %[table], #64\n\t" + "cmp %w[idx], w30\n\t" + "add w30, w30, #1\n\t" + "ldp x11, x12, [%[table], #0]\n\t" + "ldp x13, x14, [%[table], #16]\n\t" + "ldp x15, x16, [%[table], #32]\n\t" + "ldp x17, x19, [%[table], #48]\n\t" + "csel x3, xzr, x11, ne\n\t" + "csel x4, xzr, x12, ne\n\t" + "csel x5, xzr, x13, ne\n\t" + "csel x6, xzr, x14, ne\n\t" + "csel x7, xzr, x15, ne\n\t" + "csel x8, xzr, x16, ne\n\t" + "csel x9, xzr, x17, ne\n\t" + "csel x10, xzr, x19, ne\n\t" + "1:\n\t" + "add %[table], %[table], #64\n\t" + "cmp %w[idx], w30\n\t" + "add w30, w30, #1\n\t" + "ldp x11, x12, [%[table], #0]\n\t" + "ldp x13, x14, [%[table], #16]\n\t" + "ldp x15, x16, [%[table], #32]\n\t" + "ldp x17, x19, [%[table], #48]\n\t" + "csel x3, x3, x11, ne\n\t" + "csel x4, x4, x12, ne\n\t" + "csel x5, x5, x13, ne\n\t" + "csel x6, x6, x14, ne\n\t" + "csel x7, x7, x15, ne\n\t" + "csel x8, x8, x16, ne\n\t" + "csel x9, x9, x17, ne\n\t" + "csel x10, x10, x19, ne\n\t" + "cmp w30, #256\n\t" + "b.ne 1b\n\t" + "stp x3, x4, [%[r], #0]\n\t" + "stp x5, x6, [%[r], #16]\n\t" + "stp x7, x8, [%[r], #64]\n\t" + "stp x9, x10, [%[r], #80]\n\t" + : [table] "+r" (table) + : [r] "r" (r), [idx] "r" (idx) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30" + ); } #endif /* !WC_NO_CACHE_RESISTANT */ /* Multiply the point by the scalar and return the result. 
@@ -25196,7 +25240,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 4 * 6]; + sp_digit t[2 * 4 * 5]; #endif sp_point_256* p = NULL; int i; @@ -25217,7 +25261,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -25401,13 +25445,13 @@ static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_ #ifdef WOLFSSL_SP_SMALL_STACK sp_digit* tmp; #else - sp_digit tmp[2 * 4 * 6]; + sp_digit tmp[2 * 4 * 5]; #endif sp_cache_256_t* cache; int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; } @@ -25521,7 +25565,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_256* point = NULL; + sp_point_256* point = NULL; sp_digit* k = NULL; #else sp_point_256 point[2]; @@ -27325,28 +27369,49 @@ static void sp_256_ecc_recode_7_4(const sp_digit* k, ecc_recode_256* v) static void sp_256_get_entry_65_4(sp_point_256* r, const sp_table_entry_256* table, int idx) { - int i; - sp_digit mask; - - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - for (i = 1; i < 65; i++) { - mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - } + __asm__ __volatile__ ( + "mov w30, #1\n\t" + "add %[table], %[table], #64\n\t" + "cmp %w[idx], w30\n\t" + "add w30, w30, #1\n\t" + "ldp x11, x12, [%[table], #0]\n\t" + "ldp x13, x14, [%[table], #16]\n\t" + "ldp x15, x16, [%[table], #32]\n\t" + "ldp x17, x19, [%[table], #48]\n\t" + "csel x3, xzr, x11, ne\n\t" + "csel x4, xzr, x12, ne\n\t" + "csel x5, xzr, x13, ne\n\t" + "csel x6, xzr, x14, ne\n\t" + "csel x7, xzr, x15, ne\n\t" + "csel x8, xzr, x16, ne\n\t" + "csel x9, xzr, x17, ne\n\t" + "csel x10, xzr, x19, ne\n\t" + "1:\n\t" + "add %[table], %[table], #64\n\t" + "cmp %w[idx], w30\n\t" + "add w30, w30, #1\n\t" + "ldp x11, x12, [%[table], #0]\n\t" + "ldp x13, x14, [%[table], #16]\n\t" + "ldp x15, x16, [%[table], #32]\n\t" + "ldp x17, x19, [%[table], #48]\n\t" + "csel x3, x3, x11, ne\n\t" + "csel x4, x4, x12, ne\n\t" + "csel x5, x5, x13, ne\n\t" + "csel x6, x6, x14, ne\n\t" + "csel x7, x7, x15, ne\n\t" + "csel x8, x8, x16, ne\n\t" + "csel x9, x9, x17, ne\n\t" + "csel x10, x10, x19, ne\n\t" + "cmp w30, #65\n\t" + "b.ne 1b\n\t" + "stp x3, x4, [%[r], #0]\n\t" + "stp x5, x6, [%[r], #16]\n\t" + "stp x7, x8, [%[r], #64]\n\t" + "stp x9, x10, [%[r], #80]\n\t" + : [table] "+r" (table) + : [r] "r" (r), [idx] "r" (idx) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "w30" + ); } #endif /* !WC_NO_CACHE_RESISTANT */ static const sp_table_entry_256 p256_table[2405] = { @@ -39328,7 +39393,7 @@ static int 
sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, sp_digit* tmp = NULL; #else sp_point_256 rt[2]; - sp_digit tmp[2 * 4 * 6]; + sp_digit tmp[2 * 4 * 5]; #endif sp_point_256* p = NULL; sp_digit* negy = NULL; @@ -39347,7 +39412,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -39406,7 +39471,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 6); + ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 5); #ifdef WOLFSSL_SP_SMALL_STACK XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -39515,7 +39580,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -39758,7 +39823,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_256* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -39766,7 +39831,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -40110,7 +40175,7 @@ static void sp_256_mul_d_4(sp_digit* r, const sp_digit* a, /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. 
@@ -40256,8 +40321,211 @@ static WC_INLINE int sp_256_mod_4(sp_digit* r, const sp_digit* a, const sp_digit */ static void sp_256_mont_mul_order_4(sp_digit* r, const sp_digit* a, const sp_digit* b) { - sp_256_mul_4(r, a, b); - sp_256_mont_reduce_order_4(r, p256_order, p256_mp_order); + __asm__ __volatile__ ( + "ldp x13, x14, [%[a], 0]\n\t" + "ldp x15, x16, [%[a], 16]\n\t" + "ldp x17, x19, [%[b], 0]\n\t" + "ldp x20, x21, [%[b], 16]\n\t" + "# A[0] * B[0]\n\t" + "umulh x6, x13, x17\n\t" + "mul x5, x13, x17\n\t" + "# A[2] * B[0]\n\t" + "umulh x8, x15, x17\n\t" + "mul x7, x15, x17\n\t" + "# A[1] * B[0]\n\t" + "mul x3, x14, x17\n\t" + "adds x6, x6, x3\n\t" + "umulh x4, x14, x17\n\t" + "adcs x7, x7, x4\n\t" + "adc x8, x8, xzr\n\t" + "# A[0] * B[2]\n\t" + "mul x3, x13, x20\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x13, x20\n\t" + "adcs x8, x8, x4\n\t" + "# A[1] * B[3]\n\t" + "mul x9, x14, x21\n\t" + "adcs x9, x9, xzr\n\t" + "umulh x10, x14, x21\n\t" + "adc x10, x10, xzr\n\t" + "# A[0] * B[1]\n\t" + "mul x3, x13, x19\n\t" + "adds x6, x6, x3\n\t" + "umulh x4, x13, x19\n\t" + "adcs x7, x7, x4\n\t" + "# A[2] * B[1]\n\t" + "mul x3, x15, x19\n\t" + "adcs x8, x8, x3\n\t" + "umulh x4, x15, x19\n\t" + "adcs x9, x9, x4\n\t" + "adc x10, x10, xzr\n\t" + "# A[1] * B[2]\n\t" + "mul x3, x14, x20\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x14, x20\n\t" + "adcs x9, x9, x4\n\t" + "adcs x10, x10, xzr\n\t" + "adc x11, xzr, xzr\n\t" + "# A[1] * B[1]\n\t" + "mul x3, x14, x19\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x14, x19\n\t" + "adcs x8, x8, x4\n\t" + "# A[3] * B[1]\n\t" + "mul x3, x16, x19\n\t" + "adcs x9, x9, x3\n\t" + "umulh x4, x16, x19\n\t" + "adcs x10, x10, x4\n\t" + "adc x11, x11, xzr\n\t" + "# A[2] * B[2]\n\t" + "mul x3, x15, x20\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x15, x20\n\t" + "adcs x10, x10, x4\n\t" + "# A[3] * B[3]\n\t" + "mul x3, x16, x21\n\t" + "adcs x11, x11, x3\n\t" + "umulh x12, x16, x21\n\t" + "adc x12, x12, xzr\n\t" + "# A[0] * B[3]\n\t" + "mul x3, x13, x21\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x13, x21\n\t" + "adcs x9, x9, x4\n\t" + "# A[2] * B[3]\n\t" + "mul x3, x15, x21\n\t" + "adcs x10, x10, x3\n\t" + "umulh x4, x15, x21\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + "# A[3] * B[0]\n\t" + "mul x3, x16, x17\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x16, x17\n\t" + "adcs x9, x9, x4\n\t" + "# A[3] * B[2]\n\t" + "mul x3, x16, x20\n\t" + "adcs x10, x10, x3\n\t" + "umulh x4, x16, x20\n\t" + "adcs x11, x11, x4\n\t" + "adc x12, x12, xzr\n\t" + "ldp x13, x14, [%[m], 0]\n\t" + "mov x15, 0xffffffffffffffff\n\t" + "mov x16, 0xffffffff00000000\n\t" + "# mu = a[0] * mp\n\t" + "mul x17, %[mp], x5\n\t" + "# a[0+0] += m[0] * mu\n\t" + "mul x3, x13, x17\n\t" + "adds x5, x5, x3\n\t" + "umulh x4, x13, x17\n\t" + "adcs x6, x6, x4\n\t" + "# a[0+2] += m[2] * mu\n\t" + "mul x3, x15, x17\n\t" + "adcs x7, x7, x3\n\t" + "umulh x4, x15, x17\n\t" + "adcs x8, x8, x4\n\t" + "adcs x9, x9, xzr\n\t" + "adc x19, xzr, xzr\n\t" + "# a[0+1] += m[1] * mu\n\t" + "mul x3, x14, x17\n\t" + "adds x6, x6, x3\n\t" + "umulh x4, x14, x17\n\t" + "adcs x7, x7, x4\n\t" + "# a[0+3] += m[3] * mu\n\t" + "mul x3, x16, x17\n\t" + "adcs x8, x8, x3\n\t" + "umulh x4, x16, x17\n\t" + "adcs x9, x9, x4\n\t" + "# mu = a[1] * mp\n\t" + "mul x17, %[mp], x6\n\t" + "adc x19, x19, xzr\n\t" + "# a[1+0] += m[0] * mu\n\t" + "mul x3, x13, x17\n\t" + "adds x6, x6, x3\n\t" + "umulh x4, x13, x17\n\t" + "adcs x7, x7, x4\n\t" + "# a[1+2] += m[2] * mu\n\t" + "mul x3, x15, x17\n\t" + "adcs x8, x8, x3\n\t" + "umulh x4, x15, x17\n\t" + "adcs x9, x9, 
x4\n\t" + "adcs x10, x10, x19\n\t" + "adc x19, xzr, xzr\n\t" + "# a[1+1] += m[1] * mu\n\t" + "mul x3, x14, x17\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x14, x17\n\t" + "adcs x8, x8, x4\n\t" + "# a[1+3] += m[3] * mu\n\t" + "mul x3, x16, x17\n\t" + "adcs x9, x9, x3\n\t" + "umulh x4, x16, x17\n\t" + "adcs x10, x10, x4\n\t" + "# mu = a[2] * mp\n\t" + "mul x17, %[mp], x7\n\t" + "adc x19, x19, xzr\n\t" + "# a[2+0] += m[0] * mu\n\t" + "mul x3, x13, x17\n\t" + "adds x7, x7, x3\n\t" + "umulh x4, x13, x17\n\t" + "adcs x8, x8, x4\n\t" + "# a[2+2] += m[2] * mu\n\t" + "mul x3, x15, x17\n\t" + "adcs x9, x9, x3\n\t" + "umulh x4, x15, x17\n\t" + "adcs x10, x10, x4\n\t" + "adcs x11, x11, x19\n\t" + "adc x19, xzr, xzr\n\t" + "# a[2+1] += m[1] * mu\n\t" + "mul x3, x14, x17\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x14, x17\n\t" + "adcs x9, x9, x4\n\t" + "# a[2+3] += m[3] * mu\n\t" + "mul x3, x16, x17\n\t" + "adcs x10, x10, x3\n\t" + "umulh x4, x16, x17\n\t" + "adcs x11, x11, x4\n\t" + "# mu = a[3] * mp\n\t" + "mul x17, %[mp], x8\n\t" + "adc x19, x19, xzr\n\t" + "# a[3+0] += m[0] * mu\n\t" + "mul x3, x13, x17\n\t" + "adds x8, x8, x3\n\t" + "umulh x4, x13, x17\n\t" + "adcs x9, x9, x4\n\t" + "# a[3+2] += m[2] * mu\n\t" + "mul x3, x15, x17\n\t" + "adcs x10, x10, x3\n\t" + "umulh x4, x15, x17\n\t" + "adcs x11, x11, x4\n\t" + "adcs x12, x12, x19\n\t" + "adc x19, xzr, xzr\n\t" + "# a[3+1] += m[1] * mu\n\t" + "mul x3, x14, x17\n\t" + "adds x9, x9, x3\n\t" + "umulh x4, x14, x17\n\t" + "adcs x10, x10, x4\n\t" + "# a[3+3] += m[3] * mu\n\t" + "mul x3, x16, x17\n\t" + "adcs x11, x11, x3\n\t" + "umulh x4, x16, x17\n\t" + "adcs x12, x12, x4\n\t" + "csel x13, x13, xzr, cs\n\t" + "csel x14, x14, xzr, cs\n\t" + "csel x15, x15, xzr, cs\n\t" + "csel x16, x16, xzr, cs\n\t" + "subs x9, x9, x13\n\t" + "sbcs x10, x10, x14\n\t" + "sbcs x11, x11, x15\n\t" + "stp x9, x10, [%[r], 0]\n\t" + "sbc x12, x12, x16\n\t" + "stp x11, x12, [%[r], 16]\n\t" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (p256_order), + [mp] "r" (p256_mp_order) + : "memory", "x3", "x4", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "cc" + ); } #if defined(HAVE_ECC_SIGN) || (defined(HAVE_ECC_VERIFY) && defined(WOLFSSL_SP_SMALL)) @@ -40267,11 +40535,6 @@ static const uint64_t p256_order_minus_2[4] = { 0xf3b9cac2fc63254fU,0xbce6faada7179e84U,0xffffffffffffffffU, 0xffffffff00000000U }; -#else -/* The low half of the order-2 of the P256 curve. */ -static const sp_int_digit p256_order_low[2] = { - 0xf3b9cac2fc63254fU,0xbce6faada7179e84U -}; #endif /* WOLFSSL_SP_SMALL */ /* Square number mod the order of P256 curve. 
(r = a * a mod order) @@ -40281,8 +40544,179 @@ static const sp_int_digit p256_order_low[2] = { */ static void sp_256_mont_sqr_order_4(sp_digit* r, const sp_digit* a) { - sp_256_sqr_4(r, a); - sp_256_mont_reduce_order_4(r, p256_order, p256_mp_order); + __asm__ __volatile__ ( + "ldp x12, x13, [%[a], 0]\n\t" + "ldp x14, x15, [%[a], 16]\n\t" + "# A[0] * A[1]\n\t" + "umulh x6, x12, x13\n\t" + "mul x5, x12, x13\n\t" + "# A[0] * A[3]\n\t" + "umulh x8, x12, x15\n\t" + "mul x7, x12, x15\n\t" + "# A[0] * A[2]\n\t" + "mul x2, x12, x14\n\t" + "adds x6, x6, x2\n\t" + "umulh x3, x12, x14\n\t" + "adcs x7, x7, x3\n\t" + "# A[1] * A[3]\n\t" + "mul x2, x13, x15\n\t" + "adcs x8, x8, x2\n\t" + "umulh x9, x13, x15\n\t" + "adc x9, x9, xzr\n\t" + "# A[1] * A[2]\n\t" + "mul x2, x13, x14\n\t" + "adds x7, x7, x2\n\t" + "umulh x3, x13, x14\n\t" + "adcs x8, x8, x3\n\t" + "# A[2] * A[3]\n\t" + "mul x2, x14, x15\n\t" + "adcs x9, x9, x2\n\t" + "umulh x10, x14, x15\n\t" + "adc x10, x10, xzr\n\t" + "# Double\n\t" + "adds x5, x5, x5\n\t" + "adcs x6, x6, x6\n\t" + "adcs x7, x7, x7\n\t" + "adcs x8, x8, x8\n\t" + "adcs x9, x9, x9\n\t" + "adcs x10, x10, x10\n\t" + "adc x11, xzr, xzr\n\t" + "# A[0] * A[0]\n\t" + "umulh x3, x12, x12\n\t" + "mul x4, x12, x12\n\t" + "# A[1] * A[1]\n\t" + "mul x2, x13, x13\n\t" + "adds x5, x5, x3\n\t" + "umulh x3, x13, x13\n\t" + "adcs x6, x6, x2\n\t" + "# A[2] * A[2]\n\t" + "mul x2, x14, x14\n\t" + "adcs x7, x7, x3\n\t" + "umulh x3, x14, x14\n\t" + "adcs x8, x8, x2\n\t" + "# A[3] * A[3]\n\t" + "mul x2, x15, x15\n\t" + "adcs x9, x9, x3\n\t" + "umulh x3, x15, x15\n\t" + "adcs x10, x10, x2\n\t" + "adc x11, x11, x3\n\t" + "ldp x12, x13, [%[m], 0]\n\t" + "mov x14, 0xffffffffffffffff\n\t" + "mov x15, 0xffffffff00000000\n\t" + "# mu = a[0] * mp\n\t" + "mul x16, %[mp], x4\n\t" + "# a[0+0] += m[0] * mu\n\t" + "mul x2, x12, x16\n\t" + "adds x4, x4, x2\n\t" + "umulh x3, x12, x16\n\t" + "adcs x5, x5, x3\n\t" + "# a[0+2] += m[2] * mu\n\t" + "mul x2, x14, x16\n\t" + "adcs x6, x6, x2\n\t" + "umulh x3, x14, x16\n\t" + "adcs x7, x7, x3\n\t" + "adcs x8, x8, xzr\n\t" + "adc x17, xzr, xzr\n\t" + "# a[0+1] += m[1] * mu\n\t" + "mul x2, x13, x16\n\t" + "adds x5, x5, x2\n\t" + "umulh x3, x13, x16\n\t" + "adcs x6, x6, x3\n\t" + "# a[0+3] += m[3] * mu\n\t" + "mul x2, x15, x16\n\t" + "adcs x7, x7, x2\n\t" + "umulh x3, x15, x16\n\t" + "adcs x8, x8, x3\n\t" + "# mu = a[1] * mp\n\t" + "mul x16, %[mp], x5\n\t" + "adc x17, x17, xzr\n\t" + "# a[1+0] += m[0] * mu\n\t" + "mul x2, x12, x16\n\t" + "adds x5, x5, x2\n\t" + "umulh x3, x12, x16\n\t" + "adcs x6, x6, x3\n\t" + "# a[1+2] += m[2] * mu\n\t" + "mul x2, x14, x16\n\t" + "adcs x7, x7, x2\n\t" + "umulh x3, x14, x16\n\t" + "adcs x8, x8, x3\n\t" + "adcs x9, x9, x17\n\t" + "adc x17, xzr, xzr\n\t" + "# a[1+1] += m[1] * mu\n\t" + "mul x2, x13, x16\n\t" + "adds x6, x6, x2\n\t" + "umulh x3, x13, x16\n\t" + "adcs x7, x7, x3\n\t" + "# a[1+3] += m[3] * mu\n\t" + "mul x2, x15, x16\n\t" + "adcs x8, x8, x2\n\t" + "umulh x3, x15, x16\n\t" + "adcs x9, x9, x3\n\t" + "# mu = a[2] * mp\n\t" + "mul x16, %[mp], x6\n\t" + "adc x17, x17, xzr\n\t" + "# a[2+0] += m[0] * mu\n\t" + "mul x2, x12, x16\n\t" + "adds x6, x6, x2\n\t" + "umulh x3, x12, x16\n\t" + "adcs x7, x7, x3\n\t" + "# a[2+2] += m[2] * mu\n\t" + "mul x2, x14, x16\n\t" + "adcs x8, x8, x2\n\t" + "umulh x3, x14, x16\n\t" + "adcs x9, x9, x3\n\t" + "adcs x10, x10, x17\n\t" + "adc x17, xzr, xzr\n\t" + "# a[2+1] += m[1] * mu\n\t" + "mul x2, x13, x16\n\t" + "adds x7, x7, x2\n\t" + "umulh x3, x13, x16\n\t" + "adcs x8, x8, x3\n\t" + "# a[2+3] += m[3] 
* mu\n\t" + "mul x2, x15, x16\n\t" + "adcs x9, x9, x2\n\t" + "umulh x3, x15, x16\n\t" + "adcs x10, x10, x3\n\t" + "# mu = a[3] * mp\n\t" + "mul x16, %[mp], x7\n\t" + "adc x17, x17, xzr\n\t" + "# a[3+0] += m[0] * mu\n\t" + "mul x2, x12, x16\n\t" + "adds x7, x7, x2\n\t" + "umulh x3, x12, x16\n\t" + "adcs x8, x8, x3\n\t" + "# a[3+2] += m[2] * mu\n\t" + "mul x2, x14, x16\n\t" + "adcs x9, x9, x2\n\t" + "umulh x3, x14, x16\n\t" + "adcs x10, x10, x3\n\t" + "adcs x11, x11, x17\n\t" + "adc x17, xzr, xzr\n\t" + "# a[3+1] += m[1] * mu\n\t" + "mul x2, x13, x16\n\t" + "adds x8, x8, x2\n\t" + "umulh x3, x13, x16\n\t" + "adcs x9, x9, x3\n\t" + "# a[3+3] += m[3] * mu\n\t" + "mul x2, x15, x16\n\t" + "adcs x10, x10, x2\n\t" + "umulh x3, x15, x16\n\t" + "adcs x11, x11, x3\n\t" + "csel x12, x12, xzr, cs\n\t" + "csel x13, x13, xzr, cs\n\t" + "csel x14, x14, xzr, cs\n\t" + "csel x15, x15, xzr, cs\n\t" + "subs x8, x8, x12\n\t" + "sbcs x9, x9, x13\n\t" + "sbcs x10, x10, x14\n\t" + "stp x8, x9, [%[r], 0]\n\t" + "sbc x11, x11, x15\n\t" + "stp x10, x11, [%[r], 16]\n\t" + : + : [r] "r" (r), [a] "r" (a), [m] "r" (p256_order), + [mp] "r" (p256_mp_order) + : "memory", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "cc" + ); } #ifndef WOLFSSL_SP_SMALL @@ -40294,12 +40728,183 @@ static void sp_256_mont_sqr_order_4(sp_digit* r, const sp_digit* a) */ static void sp_256_mont_sqr_n_order_4(sp_digit* r, const sp_digit* a, int n) { - int i; - sp_256_mont_sqr_order_4(r, a); - for (i=1; i=112; i--) { - sp_256_mont_sqr_order_4(t2, t2); - if ((p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { - sp_256_mont_mul_order_4(t2, t2, a); - } - } - /* t2= a^ffffffff00000000ffffffffffffffffbce6f */ + + /* ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254f */ + sp_256_mont_sqr_order_4(t2, t2); + sp_256_mont_mul_order_4(t2, t2, a); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t15); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t7); sp_256_mont_sqr_n_order_4(t2, t2, 4); sp_256_mont_mul_order_4(t2, t2, t3); - /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84 */ - for (i=107; i>=64; i--) { - sp_256_mont_sqr_order_4(t2, t2); - if ((p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { - sp_256_mont_mul_order_4(t2, t2, a); - } - } - /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f */ + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t15); + sp_256_mont_sqr_n_order_4(t2, t2, 3); + sp_256_mont_mul_order_4(t2, t2, t5); sp_256_mont_sqr_n_order_4(t2, t2, 4); + sp_256_mont_mul_order_4(t2, t2, t5); + sp_256_mont_sqr_n_order_4(t2, t2, 3); sp_256_mont_mul_order_4(t2, t2, t3); - /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2 */ - for (i=59; i>=32; i--) { - sp_256_mont_sqr_order_4(t2, t2); - if ((p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { - sp_256_mont_mul_order_4(t2, t2, a); - } - } - /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2f */ - sp_256_mont_sqr_n_order_4(t2, t2, 4); + sp_256_mont_sqr_n_order_4(t2, t2, 3); sp_256_mont_mul_order_4(t2, t2, t3); - /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254 */ - for (i=27; i>=0; i--) { - sp_256_mont_sqr_order_4(t2, t2); - if ((p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { - sp_256_mont_mul_order_4(t2, t2, a); - } - } - /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632540 */ + sp_256_mont_sqr_n_order_4(t2, 
t2, 2); + sp_256_mont_mul_order_4(t2, t2, a); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t7); sp_256_mont_sqr_n_order_4(t2, t2, 4); - /* r = a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254f */ - sp_256_mont_mul_order_4(r, t2, t3); + sp_256_mont_mul_order_4(t2, t2, a); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t15); + sp_256_mont_sqr_n_order_4(t2, t2, 6); + sp_256_mont_mul_order_4(t2, t2, t15); + sp_256_mont_sqr_n_order_4(t2, t2, 2); + sp_256_mont_mul_order_4(t2, t2, a); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, a); + sp_256_mont_sqr_n_order_4(t2, t2, 6); + sp_256_mont_mul_order_4(t2, t2, t15); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t7); + sp_256_mont_sqr_n_order_4(t2, t2, 4); + sp_256_mont_mul_order_4(t2, t2, t7); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t7); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t5); + sp_256_mont_sqr_n_order_4(t2, t2, 3); + sp_256_mont_mul_order_4(t2, t2, t3); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, a); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t15); + sp_256_mont_sqr_n_order_4(t2, t2, 2); + sp_256_mont_mul_order_4(t2, t2, t3); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t3); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t3); + sp_256_mont_sqr_n_order_4(t2, t2, 3); + sp_256_mont_mul_order_4(t2, t2, a); + sp_256_mont_sqr_n_order_4(t2, t2, 5); + sp_256_mont_mul_order_4(t2, t2, t5); + sp_256_mont_sqr_n_order_4(t2, t2, 2); + sp_256_mont_mul_order_4(t2, t2, a); + sp_256_mont_sqr_n_order_4(t2, t2, 6); + sp_256_mont_mul_order_4(r, t2, t15); + /* Multiplications: 31 */ #endif /* WOLFSSL_SP_SMALL */ } @@ -40526,7 +41163,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, sp_digit* e = NULL; sp_point_256* point = NULL; #else - sp_digit e[7 * 2 * 4]; + sp_digit e[10 * 2 * 4]; sp_point_256 point[1]; #endif sp_digit* x = NULL; @@ -40548,7 +41185,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, err = MEMORY_E; } if (err == MP_OKAY) { - e = (sp_digit*)XMALLOC(sizeof(sp_digit) * 7 * 2 * 4, heap, + e = (sp_digit*)XMALLOC(sizeof(sp_digit) * 10 * 2 * 4, heap, DYNAMIC_TYPE_ECC); if (e == NULL) err = MEMORY_E; @@ -40623,7 +41260,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng, if (e != NULL) #endif { - ForceZero(e, sizeof(sp_digit) * 7 * 2 * 4); + ForceZero(e, sizeof(sp_digit) * 10 * 2 * 4); #ifdef WOLFSSL_SP_SMALL_STACK XFREE(e, heap, DYNAMIC_TYPE_ECC); #endif @@ -40652,7 +41289,7 @@ typedef struct sp_ecc_sign_256_ctx { sp_digit x[2*4]; sp_digit k[2*4]; sp_digit r[2*4]; - sp_digit tmp[3 * 2*4]; + sp_digit tmp[6 * 2*4]; sp_point_256 point; sp_digit* s; sp_digit* kInv; @@ -40797,7 +41434,7 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W XMEMSET(ctx->x, 0, sizeof(sp_digit) * 2U * 4U); XMEMSET(ctx->k, 0, sizeof(sp_digit) * 2U * 4U); XMEMSET(ctx->r, 0, sizeof(sp_digit) * 2U * 4U); - XMEMSET(ctx->tmp, 0, sizeof(sp_digit) * 3U * 2U * 4U); + XMEMSET(ctx->tmp, 0, sizeof(sp_digit) * 6U * 2U * 4U); } return err; @@ -41428,6 +42065,50 @@ int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, #endif /* HAVE_ECC_VERIFY */ #ifdef HAVE_ECC_CHECK_KEY +/* Add two Montgomery form numbers (r = a + b % m). + * + * r Result of addition. 
+ * a First number to add in Montgomery form. + * b Second number to add in Montgomery form. + * m Modulus (prime). + */ +static void sp_256_mont_add_4(sp_digit* r, const sp_digit* a, const sp_digit* b, + const sp_digit* m) +{ + __asm__ __volatile__ ( + "ldp x4, x5, [%[a], 0]\n\t" + "ldp x6, x7, [%[a], 16]\n\t" + "ldp x8, x9, [%[b], 0]\n\t" + "ldp x10, x11, [%[b], 16]\n\t" + "adds x4, x4, x8\n\t" + "adcs x5, x5, x9\n\t" + "adcs x6, x6, x10\n\t" + "adcs x7, x7, x11\n\t" + "csetm x14, cs\n\t" + "subs x4, x4, x14\n\t" + "lsr x12, x14, 32\n\t" + "sbcs x5, x5, x12\n\t" + "sub x13, xzr, x12\n\t" + "sbcs x6, x6, xzr\n\t" + "sbcs x7, x7, x13\n\t" + "sbc x13, xzr, xzr\n\t" + "sub x14, x14, x13\n\t" + "subs x4, x4, x14\n\t" + "lsr x12, x14, 32\n\t" + "sbcs x5, x5, x12\n\t" + "sub x13, xzr, x12\n\t" + "sbcs x6, x6, xzr\n\t" + "stp x4, x5, [%[r],0]\n\t" + "sbc x7, x7, x13\n\t" + "stp x6, x7, [%[r],16]\n\t" + : + : [r] "r" (r), [a] "r" (a), [b] "r" (b) + : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "cc" + ); + + (void)m; +} + /* Check that the x and y oridinates are a valid point on the curve. * * point EC point. @@ -42443,87 +43124,87 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a) "ldp x19, x20, [%[a], 16]\n\t" "ldp x21, x22, [%[a], 32]\n\t" "# A[0] * A[1]\n\t" - "mul x6, x16, x17\n\t" - "umulh x7, x16, x17\n\t" + "mul x6, x16, x17\n\t" + "umulh x7, x16, x17\n\t" "# A[0] * A[2]\n\t" - "mul x4, x16, x19\n\t" - "umulh x5, x16, x19\n\t" - "adds x7, x7, x4\n\t" + "mul x4, x16, x19\n\t" + "umulh x5, x16, x19\n\t" + "adds x7, x7, x4\n\t" "# A[0] * A[3]\n\t" - "mul x4, x16, x20\n\t" - "adc x8, xzr, x5\n\t" - "umulh x5, x16, x20\n\t" - "adds x8, x8, x4\n\t" + "mul x4, x16, x20\n\t" + "adc x8, xzr, x5\n\t" + "umulh x5, x16, x20\n\t" + "adds x8, x8, x4\n\t" "# A[1] * A[2]\n\t" - "mul x4, x17, x19\n\t" - "adc x9, xzr, x5\n\t" - "umulh x5, x17, x19\n\t" - "adds x8, x8, x4\n\t" + "mul x4, x17, x19\n\t" + "adc x9, xzr, x5\n\t" + "umulh x5, x17, x19\n\t" + "adds x8, x8, x4\n\t" "# A[0] * A[4]\n\t" - "mul x4, x16, x21\n\t" - "adcs x9, x9, x5\n\t" - "umulh x5, x16, x21\n\t" - "adc x10, xzr, xzr\n\t" - "adds x9, x9, x4\n\t" + "mul x4, x16, x21\n\t" + "adcs x9, x9, x5\n\t" + "umulh x5, x16, x21\n\t" + "adc x10, xzr, xzr\n\t" + "adds x9, x9, x4\n\t" "# A[1] * A[3]\n\t" - "mul x4, x17, x20\n\t" - "adc x10, x10, x5\n\t" - "umulh x5, x17, x20\n\t" - "adds x9, x9, x4\n\t" + "mul x4, x17, x20\n\t" + "adc x10, x10, x5\n\t" + "umulh x5, x17, x20\n\t" + "adds x9, x9, x4\n\t" "# A[0] * A[5]\n\t" - "mul x4, x16, x22\n\t" - "adcs x10, x10, x5\n\t" - "umulh x5, x16, x22\n\t" - "adc x11, xzr, xzr\n\t" - "adds x10, x10, x4\n\t" + "mul x4, x16, x22\n\t" + "adcs x10, x10, x5\n\t" + "umulh x5, x16, x22\n\t" + "adc x11, xzr, xzr\n\t" + "adds x10, x10, x4\n\t" "# A[1] * A[4]\n\t" - "mul x4, x17, x21\n\t" - "adc x11, x11, x5\n\t" - "umulh x5, x17, x21\n\t" - "adds x10, x10, x4\n\t" + "mul x4, x17, x21\n\t" + "adc x11, x11, x5\n\t" + "umulh x5, x17, x21\n\t" + "adds x10, x10, x4\n\t" "# A[2] * A[3]\n\t" - "mul x4, x19, x20\n\t" - "adcs x11, x11, x5\n\t" - "umulh x5, x19, x20\n\t" - "adc x12, xzr, xzr\n\t" - "adds x10, x10, x4\n\t" + "mul x4, x19, x20\n\t" + "adcs x11, x11, x5\n\t" + "umulh x5, x19, x20\n\t" + "adc x12, xzr, xzr\n\t" + "adds x10, x10, x4\n\t" "# A[1] * A[5]\n\t" - "mul x4, x17, x22\n\t" - "adcs x11, x11, x5\n\t" - "umulh x5, x17, x22\n\t" - "adc x12, x12, xzr\n\t" - "adds x11, x11, x4\n\t" + "mul x4, x17, x22\n\t" + "adcs x11, x11, x5\n\t" + "umulh x5, x17, x22\n\t" + "adc 
x12, x12, xzr\n\t" + "adds x11, x11, x4\n\t" "# A[2] * A[4]\n\t" - "mul x4, x19, x21\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x19, x21\n\t" - "adc x13, xzr, xzr\n\t" - "adds x11, x11, x4\n\t" + "mul x4, x19, x21\n\t" + "adcs x12, x12, x5\n\t" + "umulh x5, x19, x21\n\t" + "adc x13, xzr, xzr\n\t" + "adds x11, x11, x4\n\t" "# A[2] * A[5]\n\t" - "mul x4, x19, x22\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x19, x22\n\t" - "adc x13, x13, xzr\n\t" - "adds x12, x12, x4\n\t" + "mul x4, x19, x22\n\t" + "adcs x12, x12, x5\n\t" + "umulh x5, x19, x22\n\t" + "adc x13, x13, xzr\n\t" + "adds x12, x12, x4\n\t" "# A[3] * A[4]\n\t" - "mul x4, x20, x21\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x20, x21\n\t" - "adc x14, xzr, xzr\n\t" - "adds x12, x12, x4\n\t" + "mul x4, x20, x21\n\t" + "adcs x13, x13, x5\n\t" + "umulh x5, x20, x21\n\t" + "adc x14, xzr, xzr\n\t" + "adds x12, x12, x4\n\t" "# A[3] * A[5]\n\t" - "mul x4, x20, x22\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x20, x22\n\t" - "adc x14, x14, xzr\n\t" - "adds x13, x13, x4\n\t" + "mul x4, x20, x22\n\t" + "adcs x13, x13, x5\n\t" + "umulh x5, x20, x22\n\t" + "adc x14, x14, xzr\n\t" + "adds x13, x13, x4\n\t" "# A[4] * A[5]\n\t" - "mul x4, x21, x22\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x21, x22\n\t" - "adc x15, xzr, xzr\n\t" - "adds x14, x14, x4\n\t" - "adc x15, x15, x5\n\t" + "mul x4, x21, x22\n\t" + "adcs x14, x14, x5\n\t" + "umulh x5, x21, x22\n\t" + "adc x15, xzr, xzr\n\t" + "adds x14, x14, x4\n\t" + "adc x15, x15, x5\n\t" "# Double\n\t" "adds x6, x6, x6\n\t" "adcs x7, x7, x7\n\t" @@ -42535,34 +43216,34 @@ static void sp_384_sqr_6(sp_digit* r, const sp_digit* a) "adcs x13, x13, x13\n\t" "adcs x14, x14, x14\n\t" "# A[0] * A[0]\n\t" - "mul x5, x16, x16\n\t" + "mul x5, x16, x16\n\t" "adcs x15, x15, x15\n\t" - "umulh x2, x16, x16\n\t" + "umulh x2, x16, x16\n\t" "cset x16, cs\n\t" "# A[1] * A[1]\n\t" - "mul x3, x17, x17\n\t" + "mul x3, x17, x17\n\t" "adds x6, x6, x2\n\t" - "umulh x4, x17, x17\n\t" + "umulh x4, x17, x17\n\t" "adcs x7, x7, x3\n\t" "# A[2] * A[2]\n\t" - "mul x2, x19, x19\n\t" + "mul x2, x19, x19\n\t" "adcs x8, x8, x4\n\t" - "umulh x3, x19, x19\n\t" + "umulh x3, x19, x19\n\t" "adcs x9, x9, x2\n\t" "# A[3] * A[3]\n\t" - "mul x4, x20, x20\n\t" + "mul x4, x20, x20\n\t" "adcs x10, x10, x3\n\t" - "umulh x2, x20, x20\n\t" + "umulh x2, x20, x20\n\t" "adcs x11, x11, x4\n\t" "# A[4] * A[4]\n\t" - "mul x3, x21, x21\n\t" + "mul x3, x21, x21\n\t" "adcs x12, x12, x2\n\t" - "umulh x4, x21, x21\n\t" + "umulh x4, x21, x21\n\t" "adcs x13, x13, x3\n\t" "# A[5] * A[5]\n\t" - "mul x2, x22, x22\n\t" + "mul x2, x22, x22\n\t" "adcs x14, x14, x4\n\t" - "umulh x3, x22, x22\n\t" + "umulh x3, x22, x22\n\t" "adcs x15, x15, x2\n\t" "stp x5, x6, [%[r], 0]\n\t" "adc x16, x16, x3\n\t" @@ -42606,7 +43287,7 @@ static sp_digit sp_384_add_6(sp_digit* r, const sp_digit* a, "adcs x4, x4, x8\n\t" "str x3, [%[r], 32]\n\t" "str x4, [%[r], 40]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -42773,14 +43454,14 @@ static void sp_384_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = 
a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -43193,7 +43874,7 @@ SP_NOINLINE static void sp_384_mont_reduce_order_6(sp_digit* a, const sp_digit* "umulh x8, x10, x9\n\t" "adds x6, x6, x7\n\t" "adcs x8, x8, x3\n\t" - "cset x3, cs\n\t" + "adc x3, xzr, xzr\n\t" "adds x16, x17, x6\n\t" "ldr x17, [%[a], 48]\n\t" "adcs x17, x17, x8\n\t" @@ -43695,7 +44376,6 @@ static void sp_384_mont_sub_6(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_384_cond_add_6(r, r, m, o); } -#define sp_384_mont_sub_lower_6 sp_384_mont_sub_6 static void sp_384_rshift1_6(sp_digit* r, const sp_digit* a) { __asm__ __volatile__ ( @@ -43786,7 +44466,7 @@ static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p, /* X = X - Y */ sp_384_mont_sub_6(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_lower_6(y, y, x, p384_mod); + sp_384_mont_sub_6(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_6(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -43908,7 +44588,7 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con break; case 16: /* Y = Y - X */ - sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -43933,8 +44613,6 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_384_mont_dbl_lower_6 sp_384_mont_dbl_6 -#define sp_384_mont_tpl_lower_6 sp_384_mont_tpl_6 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -43973,7 +44651,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_6(a, t1, p384_mod); + sp_384_mont_tpl_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); @@ -43982,8 +44660,8 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i, sp_384_mont_dbl_6(t2, b, p384_mod); sp_384_mont_sub_6(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_6(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_6(b, t2, p384_mod); + sp_384_mont_sub_6(t2, b, x, p384_mod); + sp_384_mont_dbl_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -44003,7 +44681,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_6(a, t1, p384_mod); + sp_384_mont_tpl_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); @@ -44012,8 +44690,8 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i, sp_384_mont_dbl_6(t2, b, p384_mod); sp_384_mont_sub_6(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_6(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_6(b, t2, p384_mod); + sp_384_mont_sub_6(t2, b, x, p384_mod); + sp_384_mont_dbl_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -44061,12 +44739,12 @@ static int sp_384_iszero_6(const sp_digit* a) static void sp_384_proj_point_add_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*6; - sp_digit* t3 = t + 
4*6; - sp_digit* t4 = t + 6*6; - sp_digit* t5 = t + 8*6; - sp_digit* t6 = t + 10*6; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*6; + sp_digit* t2 = t + 4*6; + sp_digit* t3 = t + 6*6; + sp_digit* t4 = t + 8*6; + sp_digit* t5 = t + 10*6; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_6(t1, q->z, p384_mod, p384_mp_mod); @@ -44088,17 +44766,9 @@ static void sp_384_proj_point_add_6(sp_point_384* r, sp_384_proj_point_dbl_6(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_384_mont_sub_6(t2, t2, t1, p384_mod); @@ -44117,20 +44787,31 @@ static void sp_384_proj_point_add_6(sp_point_384* r, sp_384_mont_dbl_6(t3, y, p384_mod); sp_384_mont_sub_6(x, x, t3, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_6(y, y, x, p384_mod); + sp_384_mont_sub_6(y, y, x, p384_mod); sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_6(y, y, t5, p384_mod); - for (i = 0; i < 6; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 6; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 6; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -44176,12 +44857,12 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*6; - ctx->t3 = t + 4*6; - ctx->t4 = t + 6*6; - ctx->t5 = t + 8*6; - ctx->t6 = t + 10*6; + ctx->t6 = t; + ctx->t1 = t + 2*6; + ctx->t2 = t + 4*6; + ctx->t3 = t + 6*6; + ctx->t4 = t + 8*6; + ctx->t5 = t + 10*6; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -44288,7 +44969,7 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 22; break; case 22: @@ -44301,22 +44982,28 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 6; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | 
(q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 6; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 6; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -44375,7 +45062,7 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_6(a, t1, p384_mod); + sp_384_mont_tpl_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); @@ -44385,8 +45072,8 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r, sp_384_mont_dbl_6(t2, b, p384_mod); sp_384_mont_sub_6(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_6(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_6(b, t2, p384_mod); + sp_384_mont_sub_6(t2, b, x, p384_mod); + sp_384_mont_dbl_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(r[j].z, z, y, p384_mod, p384_mp_mod); z = r[j].z; @@ -44474,8 +45161,8 @@ static void sp_384_proj_point_add_sub_6(sp_point_384* ra, sp_384_mont_sub_6(xs, xs, t1, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_384_mont_sub_lower_6(ys, ya, xs, p384_mod); - sp_384_mont_sub_lower_6(ya, ya, xa, p384_mod); + sp_384_mont_sub_6(ys, ya, xs, p384_mod); + sp_384_mont_sub_6(ya, ya, xa, p384_mod); sp_384_mont_mul_6(ya, ya, t4, p384_mod, p384_mp_mod); sp_384_sub_6(t6, p384_mod, t6); sp_384_mont_mul_6(ys, ys, t6, p384_mod, p384_mp_mod); @@ -44567,46 +45254,65 @@ static void sp_384_get_point_33_6(sp_point_384* r, const sp_point_384* table, { int i; sp_digit mask; + sp_digit x0 = 0; + sp_digit x1 = 0; + sp_digit x2 = 0; + sp_digit x3 = 0; + sp_digit x4 = 0; + sp_digit x5 = 0; + sp_digit y0 = 0; + sp_digit y1 = 0; + sp_digit y2 = 0; + sp_digit y3 = 0; + sp_digit y4 = 0; + sp_digit y5 = 0; + sp_digit z0 = 0; + sp_digit z1 = 0; + sp_digit z2 = 0; + sp_digit z3 = 0; + sp_digit z4 = 0; + sp_digit z5 = 0; - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->x[4] = 0; - r->x[5] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - r->y[4] = 0; - r->y[5] = 0; - r->z[0] = 0; - r->z[1] = 0; - r->z[2] = 0; - r->z[3] = 0; - r->z[4] = 0; - r->z[5] = 0; for (i = 1; i < 33; i++) { mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->x[4] |= mask & table[i].x[4]; - r->x[5] |= mask & table[i].x[5]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - r->y[4] |= mask & table[i].y[4]; - r->y[5] |= mask & table[i].y[5]; - r->z[0] |= mask & table[i].z[0]; - r->z[1] |= mask & table[i].z[1]; - r->z[2] |= mask & table[i].z[2]; - r->z[3] |= mask & table[i].z[3]; - r->z[4] |= mask & table[i].z[4]; - r->z[5] |= mask & table[i].z[5]; + x0 |= mask & table[i].x[0]; + x1 |= mask & table[i].x[1]; + x2 |= mask & table[i].x[2]; + x3 |= mask & table[i].x[3]; + x4 |= mask & table[i].x[4]; + x5 |= mask & table[i].x[5]; + y0 |= mask & table[i].y[0]; + y1 |= mask & table[i].y[1]; + y2 |= mask & 
table[i].y[2]; + y3 |= mask & table[i].y[3]; + y4 |= mask & table[i].y[4]; + y5 |= mask & table[i].y[5]; + z0 |= mask & table[i].z[0]; + z1 |= mask & table[i].z[1]; + z2 |= mask & table[i].z[2]; + z3 |= mask & table[i].z[3]; + z4 |= mask & table[i].z[4]; + z5 |= mask & table[i].z[5]; } + + r->x[0] = x0; + r->x[1] = x1; + r->x[2] = x2; + r->x[3] = x3; + r->x[4] = x4; + r->x[5] = x5; + r->y[0] = y0; + r->y[1] = y1; + r->y[2] = y2; + r->y[3] = y3; + r->y[4] = y4; + r->y[5] = y5; + r->z[0] = z0; + r->z[1] = z1; + r->z[2] = z2; + r->z[3] = z3; + r->z[4] = z4; + r->z[5] = z5; } #endif /* !WC_NO_CACHE_RESISTANT */ /* Multiply the point by the scalar and return the result. @@ -44649,7 +45355,7 @@ static int sp_384_ecc_mulmod_win_add_sub_6(sp_point_384* r, const sp_point_384* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * + t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -44754,15 +45460,12 @@ static int sp_384_ecc_mulmod_win_add_sub_6(sp_point_384* r, const sp_point_384* return err; } -#ifndef WC_NO_CACHE_RESISTANT /* A table entry for pre-computed points. */ typedef struct sp_table_entry_384 { sp_digit x[6]; sp_digit y[6]; } sp_table_entry_384; -#if defined(FP_ECC) || defined(WOLFSSL_SP_SMALL) -#endif /* FP_ECC | WOLFSSL_SP_SMALL */ /* Add two Montgomery form projective points. The second point has a q value of * one. * Only the first point can be the same pointer as the result point. @@ -44775,12 +45478,12 @@ typedef struct sp_table_entry_384 { static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*6; - sp_digit* t3 = t + 4*6; - sp_digit* t4 = t + 6*6; - sp_digit* t5 = t + 8*6; - sp_digit* t6 = t + 10*6; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*6; + sp_digit* t6 = t + 4*6; + sp_digit* t1 = t + 6*6; + sp_digit* t4 = t + 8*6; + sp_digit* t5 = t + 10*6; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -44796,13 +45499,9 @@ static void sp_384_proj_point_add_qz1_6(sp_point_384* r, sp_384_proj_point_dbl_6(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_384_mont_sub_6(t2, t2, p->x, p384_mod); @@ -44811,36 +45510,46 @@ static void sp_384_proj_point_add_qz1_6(sp_point_384* r, /* Z3 = H*Z1 */ sp_384_mont_mul_6(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_384_mont_sqr_6(t1, t4, p384_mod, p384_mp_mod); - sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t3, p->x, t5, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_6(x, t1, t5, p384_mod); - sp_384_mont_dbl_6(t1, t3, p384_mod); - sp_384_mont_sub_6(x, x, t1, p384_mod); + sp_384_mont_sqr_6(t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t3, p->x, t1, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t1, t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(t2, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_6(t2, t2, t1, p384_mod); + sp_384_mont_dbl_6(t5, t3, p384_mod); + sp_384_mont_sub_6(x, t2, t5, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_lower_6(t3, t3, x, p384_mod); + sp_384_mont_sub_6(t3, t3, x, p384_mod); sp_384_mont_mul_6(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t5, t5, p->y, p384_mod, p384_mp_mod); - sp_384_mont_sub_6(y, t3, t5, p384_mod); + sp_384_mont_mul_6(t1, t1, p->y, p384_mod, p384_mp_mod); + sp_384_mont_sub_6(y, t3, t1, p384_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 6; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 6; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 6; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } +#ifndef WC_NO_CACHE_RESISTANT +#if defined(FP_ECC) || defined(WOLFSSL_SP_SMALL) +#endif /* FP_ECC | WOLFSSL_SP_SMALL */ #ifdef FP_ECC /* Convert the projective point to affine. * Ordinates are in Montgomery form. 
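The rewritten sp_384_proj_point_add_qz1_6 above defers the infinity handling to a single block that derives three masks from the input infinity flags and selects every output word without branching, recording the both-infinity case in one `inf` value. The following is a minimal standalone sketch of that selection idiom, assuming 6-word coordinates, flags that are strictly 0 or 1, and an illustrative point_t type that is not the library's.

#include <stdint.h>

typedef struct {
    uint64_t x[6];
    uint64_t y[6];
    uint64_t z[6];
    uint32_t infinity;      /* assumed to be 0 or 1 */
} point_t;

/* Select the addition result without branching on the infinity flags:
 * r = p when q is infinity, r = q when p is infinity, otherwise the
 * freshly computed coordinates (cx, cy, cz). */
static void select_add_result(point_t* r, const point_t* p, const point_t* q,
                              const uint64_t* cx, const uint64_t* cy,
                              const uint64_t* cz)
{
    int i;
    uint64_t maskp = 0 - (uint64_t)(q->infinity & (!p->infinity));
    uint64_t maskq = 0 - (uint64_t)(p->infinity & (!q->infinity));
    uint64_t maskt = ~(maskp | maskq);
    uint64_t inf   = (uint64_t)(p->infinity & q->infinity);

    for (i = 0; i < 6; i++) {
        r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (cx[i] & maskt);
        r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (cy[i] & maskt);
        r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (cz[i] & maskt);
    }
    /* Both inputs at infinity: set the low Z word and the flag, as the
     * patched functions do. */
    r->z[0] |= inf;
    r->infinity = (uint32_t)inf;
}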
@@ -44968,34 +45677,47 @@ static void sp_384_get_entry_64_6(sp_point_384* r, { int i; sp_digit mask; + sp_digit x0 = 0; + sp_digit x1 = 0; + sp_digit x2 = 0; + sp_digit x3 = 0; + sp_digit x4 = 0; + sp_digit x5 = 0; + sp_digit y0 = 0; + sp_digit y1 = 0; + sp_digit y2 = 0; + sp_digit y3 = 0; + sp_digit y4 = 0; + sp_digit y5 = 0; - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->x[4] = 0; - r->x[5] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - r->y[4] = 0; - r->y[5] = 0; for (i = 1; i < 64; i++) { mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->x[4] |= mask & table[i].x[4]; - r->x[5] |= mask & table[i].x[5]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - r->y[4] |= mask & table[i].y[4]; - r->y[5] |= mask & table[i].y[5]; + x0 |= mask & table[i].x[0]; + x1 |= mask & table[i].x[1]; + x2 |= mask & table[i].x[2]; + x3 |= mask & table[i].x[3]; + x4 |= mask & table[i].x[4]; + x5 |= mask & table[i].x[5]; + y0 |= mask & table[i].y[0]; + y1 |= mask & table[i].y[1]; + y2 |= mask & table[i].y[2]; + y3 |= mask & table[i].y[3]; + y4 |= mask & table[i].y[4]; + y5 |= mask & table[i].y[5]; } + + r->x[0] = x0; + r->x[1] = x1; + r->x[2] = x2; + r->x[3] = x3; + r->x[4] = x4; + r->x[5] = x5; + r->y[0] = y0; + r->y[1] = y1; + r->y[2] = y2; + r->y[3] = y3; + r->y[4] = y4; + r->y[5] = y5; } #endif /* !WC_NO_CACHE_RESISTANT */ /* Multiply the point by the scalar and return the result. @@ -45384,34 +46106,47 @@ static void sp_384_get_entry_256_6(sp_point_384* r, { int i; sp_digit mask; + sp_digit x0 = 0; + sp_digit x1 = 0; + sp_digit x2 = 0; + sp_digit x3 = 0; + sp_digit x4 = 0; + sp_digit x5 = 0; + sp_digit y0 = 0; + sp_digit y1 = 0; + sp_digit y2 = 0; + sp_digit y3 = 0; + sp_digit y4 = 0; + sp_digit y5 = 0; - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->x[4] = 0; - r->x[5] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - r->y[4] = 0; - r->y[5] = 0; for (i = 1; i < 256; i++) { mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->x[4] |= mask & table[i].x[4]; - r->x[5] |= mask & table[i].x[5]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - r->y[4] |= mask & table[i].y[4]; - r->y[5] |= mask & table[i].y[5]; + x0 |= mask & table[i].x[0]; + x1 |= mask & table[i].x[1]; + x2 |= mask & table[i].x[2]; + x3 |= mask & table[i].x[3]; + x4 |= mask & table[i].x[4]; + x5 |= mask & table[i].x[5]; + y0 |= mask & table[i].y[0]; + y1 |= mask & table[i].y[1]; + y2 |= mask & table[i].y[2]; + y3 |= mask & table[i].y[3]; + y4 |= mask & table[i].y[4]; + y5 |= mask & table[i].y[5]; } + + r->x[0] = x0; + r->x[1] = x1; + r->x[2] = x2; + r->x[3] = x3; + r->x[4] = x4; + r->x[5] = x5; + r->y[0] = y0; + r->y[1] = y1; + r->y[2] = y2; + r->y[3] = y3; + r->y[4] = y4; + r->y[5] = y5; } #endif /* !WC_NO_CACHE_RESISTANT */ /* Multiply the point by the scalar and return the result. 
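The sp_384_get_entry_64_6 hunk above changes the cache-resistant lookup so the masked scan accumulates into stack locals and the destination point is written exactly once at the end, rather than read-modify-written on every loop iteration. Below is a minimal sketch of that pattern for a hypothetical table of 4-word affine entries; entry_t and get_entry are illustrative names only.

#include <stdint.h>

typedef struct {
    uint64_t x[4];
    uint64_t y[4];
} entry_t;

/* Constant-time table lookup: every entry is read, and only the one whose
 * index equals idx contributes through an all-ones mask.  As in the patched
 * functions, the scan starts at 1, so idx == 0 yields the all-zero entry. */
static void get_entry(entry_t* r, const entry_t* table, int size, int idx)
{
    int i;
    int j;
    uint64_t x[4] = {0, 0, 0, 0};
    uint64_t y[4] = {0, 0, 0, 0};

    for (i = 1; i < size; i++) {
        uint64_t mask = 0 - (uint64_t)(i == idx);
        for (j = 0; j < 4; j++) {
            x[j] |= mask & table[i].x[j];
            y[j] |= mask & table[i].y[j];
        }
    }
    /* One pass of plain stores at the end, instead of ORs into *r above. */
    for (j = 0; j < 4; j++) {
        r->x[j] = x[j];
        r->y[j] = y[j];
    }
}

One likely motivation is that the accumulators can stay in registers for the whole scan instead of the destination memory being updated on every iteration.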
@@ -45764,7 +46499,7 @@ int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_384* point = NULL; + sp_point_384* point = NULL; sp_digit* k = NULL; #else sp_point_384 point[2]; @@ -47570,34 +48305,47 @@ static void sp_384_get_entry_65_6(sp_point_384* r, { int i; sp_digit mask; + sp_digit x0 = 0; + sp_digit x1 = 0; + sp_digit x2 = 0; + sp_digit x3 = 0; + sp_digit x4 = 0; + sp_digit x5 = 0; + sp_digit y0 = 0; + sp_digit y1 = 0; + sp_digit y2 = 0; + sp_digit y3 = 0; + sp_digit y4 = 0; + sp_digit y5 = 0; - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->x[4] = 0; - r->x[5] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - r->y[4] = 0; - r->y[5] = 0; for (i = 1; i < 65; i++) { mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->x[4] |= mask & table[i].x[4]; - r->x[5] |= mask & table[i].x[5]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - r->y[4] |= mask & table[i].y[4]; - r->y[5] |= mask & table[i].y[5]; + x0 |= mask & table[i].x[0]; + x1 |= mask & table[i].x[1]; + x2 |= mask & table[i].x[2]; + x3 |= mask & table[i].x[3]; + x4 |= mask & table[i].x[4]; + x5 |= mask & table[i].x[5]; + y0 |= mask & table[i].y[0]; + y1 |= mask & table[i].y[1]; + y2 |= mask & table[i].y[2]; + y3 |= mask & table[i].y[3]; + y4 |= mask & table[i].y[4]; + y5 |= mask & table[i].y[5]; } + + r->x[0] = x0; + r->x[1] = x1; + r->x[2] = x2; + r->x[3] = x3; + r->x[4] = x4; + r->x[5] = x5; + r->y[0] = y0; + r->y[1] = y1; + r->y[2] = y2; + r->y[3] = y3; + r->y[4] = y4; + r->y[5] = y5; } #endif /* !WC_NO_CACHE_RESISTANT */ static const sp_table_entry_384 p384_table[3575] = { @@ -65580,7 +66328,7 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -65827,7 +66575,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_384* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -65835,7 +66583,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -66242,7 +66990,7 @@ static void sp_384_mul_d_6(sp_digit* r, const sp_digit* a, /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. 
@@ -69156,7 +69904,7 @@ static sp_digit sp_521_add_9(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r]], #16\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" + "adc %[c], xzr, xzr\n\t" "cmp %[a], x11\n\t" "b.ne 1b\n\t" "adds %[c], %[c], #-1\n\t" @@ -69164,7 +69912,7 @@ static sp_digit sp_521_add_9(sp_digit* r, const sp_digit* a, "ldr x7, [%[b]], #8\n\t" "adcs x3, x3, x7\n\t" "str x3, [%[r]], #8\n\t" - "cset %[c], cs\n\t" + "adc %[c], xzr, xzr\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc" @@ -69208,7 +69956,7 @@ static sp_digit sp_521_add_9(sp_digit* r, const sp_digit* a, "ldr x7, [%[b], 64]\n\t" "adcs x3, x3, x7\n\t" "str x3, [%[r], 64]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -69828,7 +70576,7 @@ static void sp_521_mul_d_9(sp_digit* r, const sp_digit* a, /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. @@ -70115,14 +70863,14 @@ static void sp_521_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -71584,7 +72332,7 @@ SP_NOINLINE static void sp_521_mont_reduce_9(sp_digit* a, const sp_digit* m, "umulh x8, x11, x9\n\t" "adds x5, x5, x7\n\t" "adcs x8, x8, x3\n\t" - "cset x3, cs\n\t" + "adc x3, xzr, xzr\n\t" "adds x21, x22, x5\n\t" "ldr x22, [%[a], 72]\n\t" "adcs x22, x22, x8\n\t" @@ -71727,9 +72475,11 @@ static void sp_521_mont_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b, "stp x10, x11, [%[r], 48]\n\t" "str x12, [%[r], 64]\n\t" : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "cc" ); + + (void)m; } /* Double a Montgomery form number (r = a + a % m). @@ -71772,9 +72522,11 @@ static void sp_521_mont_dbl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) "stp x10, x11, [%[r], 48]\n\t" "str x12, [%[r], 64]\n\t" : - : [r] "r" (r), [a] "r" (a), [m] "r" (m) + : [r] "r" (r), [a] "r" (a) : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc" ); + + (void)m; } /* Triple a Montgomery form number (r = a + a + a % m). @@ -71826,9 +72578,11 @@ static void sp_521_mont_tpl_9(sp_digit* r, const sp_digit* a, const sp_digit* m) "stp x20, x21, [%[r], 48]\n\t" "str x22, [%[r], 64]\n\t" : - : [r] "r" (r), [a] "r" (a), [m] "r" (m) + : [r] "r" (r), [a] "r" (a) : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "cc" ); + + (void)m; } /* Subtract two Montgomery form numbers (r = a - b % m). 
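Several hunks above swap "cset %[r], cs" for "adc %[r], xzr, xzr" when the carry out of an addition chain is needed (sp_521_add_9 and sp_521_mont_reduce_9 here, sp_384_add_6 earlier); both forms leave 0 or 1 in the destination when the preceding instruction set the carry flag. The fragment below is a tiny standalone AArch64 inline-assembly routine using the adc form; the function name and two-word size are illustrative, and it only builds with a 64-bit Arm compiler.

#include <stdint.h>

/* r = a + b over two 64-bit words; the return value is the carry out,
 * materialized with "adc c, xzr, xzr" rather than a separate cset. */
static uint64_t add_2_words(uint64_t* r, const uint64_t* a, const uint64_t* b)
{
    uint64_t c;

    __asm__ __volatile__ (
        "ldp x3, x4, [%[a]]\n\t"
        "ldp x5, x6, [%[b]]\n\t"
        "adds x3, x3, x5\n\t"        /* low word, sets carry  */
        "adcs x4, x4, x6\n\t"        /* high word, uses carry */
        "stp x3, x4, [%[r]]\n\t"
        "adc %[c], xzr, xzr\n\t"     /* c = carry flag (0 or 1) */
        : [c] "=r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "x3", "x4", "x5", "x6", "cc"
    );
    return c;
}

The same region also drops the modulus from the operand list of the P-521 Montgomery add/double/triple/subtract helpers and keeps the parameter only as "(void)m;", which is safe precisely because the assembly never references %[m].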
@@ -71879,12 +72633,13 @@ static void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, "stp x10, x11, [%[r], 48]\n\t" "str x12, [%[r], 64]\n\t" : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) + : [r] "r" (r), [a] "r" (a), [b] "r" (b) : "memory", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "cc" ); + + (void)m; } -#define sp_521_mont_sub_lower_9 sp_521_mont_sub_9 #ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. @@ -72082,7 +72837,7 @@ static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, /* X = X - Y */ sp_521_mont_sub_9(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_lower_9(y, y, x, p521_mod); + sp_521_mont_sub_9(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -72204,7 +72959,7 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con break; case 16: /* Y = Y - X */ - sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -72229,8 +72984,6 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_521_mont_dbl_lower_9 sp_521_mont_dbl_9 -#define sp_521_mont_tpl_lower_9 sp_521_mont_tpl_9 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -72269,7 +73022,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_9(a, t1, p521_mod); + sp_521_mont_tpl_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -72278,8 +73031,8 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_9(b, t2, p521_mod); + sp_521_mont_sub_9(t2, b, x, p521_mod); + sp_521_mont_dbl_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -72299,7 +73052,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_9(a, t1, p521_mod); + sp_521_mont_tpl_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -72308,8 +73061,8 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_9(b, t2, p521_mod); + sp_521_mont_sub_9(t2, b, x, p521_mod); + sp_521_mont_dbl_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -72359,12 +73112,12 @@ static int sp_521_iszero_9(const sp_digit* a) static void sp_521_proj_point_add_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*9; - sp_digit* t3 = t + 4*9; - sp_digit* t4 = t + 6*9; - sp_digit* t5 = t + 8*9; - sp_digit* t6 = t + 
10*9; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*9; + sp_digit* t2 = t + 4*9; + sp_digit* t3 = t + 6*9; + sp_digit* t4 = t + 8*9; + sp_digit* t5 = t + 10*9; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod); @@ -72386,17 +73139,9 @@ static void sp_521_proj_point_add_9(sp_point_521* r, sp_521_proj_point_dbl_9(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_521_mont_sub_9(t2, t2, t1, p521_mod); @@ -72415,20 +73160,31 @@ static void sp_521_proj_point_add_9(sp_point_521* r, sp_521_mont_dbl_9(t3, y, p521_mod); sp_521_mont_sub_9(x, x, t3, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_9(y, y, x, p521_mod); + sp_521_mont_sub_9(y, y, x, p521_mod); sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t5, p521_mod); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -72474,12 +73230,12 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*9; - ctx->t3 = t + 4*9; - ctx->t4 = t + 6*9; - ctx->t5 = t + 8*9; - ctx->t6 = t + 10*9; + ctx->t6 = t; + ctx->t1 = t + 2*9; + ctx->t2 = t + 4*9; + ctx->t3 = t + 6*9; + ctx->t4 = t + 8*9; + ctx->t5 = t + 10*9; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -72586,7 +73342,7 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 22; break; case 22: @@ -72599,22 +73355,28 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 9; i++) { + 
r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -72673,7 +73435,7 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_9(a, t1, p521_mod); + sp_521_mont_tpl_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -72683,8 +73445,8 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r, sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_9(b, t2, p521_mod); + sp_521_mont_sub_9(t2, b, x, p521_mod); + sp_521_mont_dbl_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(r[j].z, z, y, p521_mod, p521_mp_mod); z = r[j].z; @@ -72772,8 +73534,8 @@ static void sp_521_proj_point_add_sub_9(sp_point_521* ra, sp_521_mont_sub_9(xs, xs, t1, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_521_mont_sub_lower_9(ys, ya, xs, p521_mod); - sp_521_mont_sub_lower_9(ya, ya, xa, p521_mod); + sp_521_mont_sub_9(ys, ya, xs, p521_mod); + sp_521_mont_sub_9(ya, ya, xa, p521_mod); sp_521_mont_mul_9(ya, ya, t4, p521_mod, p521_mp_mod); sp_521_sub_9(t6, p521_mod, t6); sp_521_mont_mul_9(ys, ys, t6, p521_mod, p521_mp_mod); @@ -72865,64 +73627,92 @@ static void sp_521_get_point_33_9(sp_point_521* r, const sp_point_521* table, { int i; sp_digit mask; + sp_digit x0 = 0; + sp_digit x1 = 0; + sp_digit x2 = 0; + sp_digit x3 = 0; + sp_digit x4 = 0; + sp_digit x5 = 0; + sp_digit x6 = 0; + sp_digit x7 = 0; + sp_digit x8 = 0; + sp_digit y0 = 0; + sp_digit y1 = 0; + sp_digit y2 = 0; + sp_digit y3 = 0; + sp_digit y4 = 0; + sp_digit y5 = 0; + sp_digit y6 = 0; + sp_digit y7 = 0; + sp_digit y8 = 0; + sp_digit z0 = 0; + sp_digit z1 = 0; + sp_digit z2 = 0; + sp_digit z3 = 0; + sp_digit z4 = 0; + sp_digit z5 = 0; + sp_digit z6 = 0; + sp_digit z7 = 0; + sp_digit z8 = 0; - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->x[4] = 0; - r->x[5] = 0; - r->x[6] = 0; - r->x[7] = 0; - r->x[8] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - r->y[4] = 0; - r->y[5] = 0; - r->y[6] = 0; - r->y[7] = 0; - r->y[8] = 0; - r->z[0] = 0; - r->z[1] = 0; - r->z[2] = 0; - r->z[3] = 0; - r->z[4] = 0; - r->z[5] = 0; - r->z[6] = 0; - r->z[7] = 0; - r->z[8] = 0; for (i = 1; i < 33; i++) { mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->x[4] |= mask & table[i].x[4]; - r->x[5] |= mask & table[i].x[5]; - r->x[6] |= mask & table[i].x[6]; - r->x[7] |= mask & table[i].x[7]; - r->x[8] |= mask & table[i].x[8]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - r->y[4] |= mask & table[i].y[4]; - r->y[5] |= mask & table[i].y[5]; - r->y[6] |= mask & table[i].y[6]; - r->y[7] |= mask & table[i].y[7]; - r->y[8] |= mask & table[i].y[8]; - r->z[0] |= mask & table[i].z[0]; 
- r->z[1] |= mask & table[i].z[1]; - r->z[2] |= mask & table[i].z[2]; - r->z[3] |= mask & table[i].z[3]; - r->z[4] |= mask & table[i].z[4]; - r->z[5] |= mask & table[i].z[5]; - r->z[6] |= mask & table[i].z[6]; - r->z[7] |= mask & table[i].z[7]; - r->z[8] |= mask & table[i].z[8]; + x0 |= mask & table[i].x[0]; + x1 |= mask & table[i].x[1]; + x2 |= mask & table[i].x[2]; + x3 |= mask & table[i].x[3]; + x4 |= mask & table[i].x[4]; + x5 |= mask & table[i].x[5]; + x6 |= mask & table[i].x[6]; + x7 |= mask & table[i].x[7]; + x8 |= mask & table[i].x[8]; + y0 |= mask & table[i].y[0]; + y1 |= mask & table[i].y[1]; + y2 |= mask & table[i].y[2]; + y3 |= mask & table[i].y[3]; + y4 |= mask & table[i].y[4]; + y5 |= mask & table[i].y[5]; + y6 |= mask & table[i].y[6]; + y7 |= mask & table[i].y[7]; + y8 |= mask & table[i].y[8]; + z0 |= mask & table[i].z[0]; + z1 |= mask & table[i].z[1]; + z2 |= mask & table[i].z[2]; + z3 |= mask & table[i].z[3]; + z4 |= mask & table[i].z[4]; + z5 |= mask & table[i].z[5]; + z6 |= mask & table[i].z[6]; + z7 |= mask & table[i].z[7]; + z8 |= mask & table[i].z[8]; } + + r->x[0] = x0; + r->x[1] = x1; + r->x[2] = x2; + r->x[3] = x3; + r->x[4] = x4; + r->x[5] = x5; + r->x[6] = x6; + r->x[7] = x7; + r->x[8] = x8; + r->y[0] = y0; + r->y[1] = y1; + r->y[2] = y2; + r->y[3] = y3; + r->y[4] = y4; + r->y[5] = y5; + r->y[6] = y6; + r->y[7] = y7; + r->y[8] = y8; + r->z[0] = z0; + r->z[1] = z1; + r->z[2] = z2; + r->z[3] = z3; + r->z[4] = z4; + r->z[5] = z5; + r->z[6] = z6; + r->z[7] = z7; + r->z[8] = z8; } #endif /* !WC_NO_CACHE_RESISTANT */ /* Multiply the point by the scalar and return the result. @@ -72965,7 +73755,7 @@ static int sp_521_ecc_mulmod_win_add_sub_9(sp_point_521* r, const sp_point_521* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * + t = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -73070,15 +73860,12 @@ static int sp_521_ecc_mulmod_win_add_sub_9(sp_point_521* r, const sp_point_521* return err; } -#ifndef WC_NO_CACHE_RESISTANT /* A table entry for pre-computed points. */ typedef struct sp_table_entry_521 { sp_digit x[9]; sp_digit y[9]; } sp_table_entry_521; -#if defined(FP_ECC) || defined(WOLFSSL_SP_SMALL) -#endif /* FP_ECC | WOLFSSL_SP_SMALL */ /* Add two Montgomery form projective points. The second point has a q value of * one. * Only the first point can be the same pointer as the result point. @@ -73091,12 +73878,12 @@ typedef struct sp_table_entry_521 { static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*9; - sp_digit* t3 = t + 4*9; - sp_digit* t4 = t + 6*9; - sp_digit* t5 = t + 8*9; - sp_digit* t6 = t + 10*9; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*9; + sp_digit* t6 = t + 4*9; + sp_digit* t1 = t + 6*9; + sp_digit* t4 = t + 8*9; + sp_digit* t5 = t + 10*9; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -73112,13 +73899,9 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, sp_521_proj_point_dbl_9(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_521_mont_sub_9(t2, t2, p->x, p521_mod); @@ -73127,36 +73910,46 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, /* Z3 = H*Z1 */ sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_521_mont_sqr_9(t1, t4, p521_mod, p521_mp_mod); - sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t3, p->x, t5, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(x, t1, t5, p521_mod); - sp_521_mont_dbl_9(t1, t3, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); + sp_521_mont_sqr_9(t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, p->x, t1, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t1, t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_9(t2, t2, t1, p521_mod); + sp_521_mont_dbl_9(t5, t3, p521_mod); + sp_521_mont_sub_9(x, t2, t5, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_lower_9(t3, t3, x, p521_mod); + sp_521_mont_sub_9(t3, t3, x, p521_mod); sp_521_mont_mul_9(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t5, t5, p->y, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(y, t3, t5, p521_mod); + sp_521_mont_mul_9(t1, t1, p->y, p521_mod, p521_mp_mod); + sp_521_mont_sub_9(y, t3, t1, p521_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } +#ifndef WC_NO_CACHE_RESISTANT +#if defined(FP_ECC) || defined(WOLFSSL_SP_SMALL) +#endif /* FP_ECC | WOLFSSL_SP_SMALL */ #ifdef FP_ECC /* Convert the projective point to affine. * Ordinates are in Montgomery form. 
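sp_521_proj_point_add_qz1_9 above, like its P-384 counterpart earlier, keeps the mixed-addition formulas from the comments but reassigns the temporaries and moves the result selection into the masked block. For readability, here is the same sequence of operations written over a toy single-word field with illustrative helpers (fe_sub, fe_mul, fe_sqr, fe_dbl); it is a sketch of the math only, not the library's API, and it omits the infinity and doubling special cases that the patched code handles separately.

#include <stdint.h>

/* Toy field: integers mod a small prime so the sketch is runnable.
 * The real code uses multi-word values in Montgomery form instead. */
#define P 2147483647ULL                    /* 2^31 - 1, illustrative only */

static uint64_t fe_sub(uint64_t a, uint64_t b) { return (a + P - b) % P; }
static uint64_t fe_mul(uint64_t a, uint64_t b) { return (a * b) % P; }
static uint64_t fe_sqr(uint64_t a)             { return (a * a) % P; }
static uint64_t fe_dbl(uint64_t a)             { return (a + a) % P; }

typedef struct { uint64_t x, y, z; } jac_t;    /* Jacobian (X/Z^2, Y/Z^3) */
typedef struct { uint64_t x, y; } aff_t;       /* affine, i.e. Z = 1 */

/* Mixed addition r = p + q with q affine (the "qz1" case):
 *   U2 = X2*Z1^2, S2 = Y2*Z1^3, H = U2 - X1, R = S2 - Y1,
 *   X3 = R^2 - H^3 - 2*X1*H^2, Y3 = R*(X1*H^2 - X3) - Y1*H^3, Z3 = H*Z1 */
static void jac_add_affine(jac_t* r, const jac_t* p, const aff_t* q)
{
    uint64_t z1z1 = fe_sqr(p->z);
    uint64_t u2   = fe_mul(q->x, z1z1);
    uint64_t s2   = fe_mul(q->y, fe_mul(z1z1, p->z));
    uint64_t h    = fe_sub(u2, p->x);
    uint64_t rr   = fe_sub(s2, p->y);
    uint64_t h2   = fe_sqr(h);
    uint64_t h3   = fe_mul(h2, h);
    uint64_t v    = fe_mul(p->x, h2);          /* X1*H^2 */
    uint64_t x3   = fe_sub(fe_sub(fe_sqr(rr), h3), fe_dbl(v));

    r->x = x3;
    r->y = fe_sub(fe_mul(rr, fe_sub(v, x3)), fe_mul(p->y, h3));
    r->z = fe_mul(h, p->z);
}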
@@ -73284,46 +74077,65 @@ static void sp_521_get_entry_64_9(sp_point_521* r, { int i; sp_digit mask; + sp_digit x0 = 0; + sp_digit x1 = 0; + sp_digit x2 = 0; + sp_digit x3 = 0; + sp_digit x4 = 0; + sp_digit x5 = 0; + sp_digit x6 = 0; + sp_digit x7 = 0; + sp_digit x8 = 0; + sp_digit y0 = 0; + sp_digit y1 = 0; + sp_digit y2 = 0; + sp_digit y3 = 0; + sp_digit y4 = 0; + sp_digit y5 = 0; + sp_digit y6 = 0; + sp_digit y7 = 0; + sp_digit y8 = 0; - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->x[4] = 0; - r->x[5] = 0; - r->x[6] = 0; - r->x[7] = 0; - r->x[8] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - r->y[4] = 0; - r->y[5] = 0; - r->y[6] = 0; - r->y[7] = 0; - r->y[8] = 0; for (i = 1; i < 64; i++) { mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->x[4] |= mask & table[i].x[4]; - r->x[5] |= mask & table[i].x[5]; - r->x[6] |= mask & table[i].x[6]; - r->x[7] |= mask & table[i].x[7]; - r->x[8] |= mask & table[i].x[8]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - r->y[4] |= mask & table[i].y[4]; - r->y[5] |= mask & table[i].y[5]; - r->y[6] |= mask & table[i].y[6]; - r->y[7] |= mask & table[i].y[7]; - r->y[8] |= mask & table[i].y[8]; + x0 |= mask & table[i].x[0]; + x1 |= mask & table[i].x[1]; + x2 |= mask & table[i].x[2]; + x3 |= mask & table[i].x[3]; + x4 |= mask & table[i].x[4]; + x5 |= mask & table[i].x[5]; + x6 |= mask & table[i].x[6]; + x7 |= mask & table[i].x[7]; + x8 |= mask & table[i].x[8]; + y0 |= mask & table[i].y[0]; + y1 |= mask & table[i].y[1]; + y2 |= mask & table[i].y[2]; + y3 |= mask & table[i].y[3]; + y4 |= mask & table[i].y[4]; + y5 |= mask & table[i].y[5]; + y6 |= mask & table[i].y[6]; + y7 |= mask & table[i].y[7]; + y8 |= mask & table[i].y[8]; } + + r->x[0] = x0; + r->x[1] = x1; + r->x[2] = x2; + r->x[3] = x3; + r->x[4] = x4; + r->x[5] = x5; + r->x[6] = x6; + r->x[7] = x7; + r->x[8] = x8; + r->y[0] = y0; + r->y[1] = y1; + r->y[2] = y2; + r->y[3] = y3; + r->y[4] = y4; + r->y[5] = y5; + r->y[6] = y6; + r->y[7] = y7; + r->y[8] = y8; } #endif /* !WC_NO_CACHE_RESISTANT */ /* Multiply the point by the scalar and return the result. 
@@ -73712,46 +74524,65 @@ static void sp_521_get_entry_256_9(sp_point_521* r, { int i; sp_digit mask; + sp_digit x0 = 0; + sp_digit x1 = 0; + sp_digit x2 = 0; + sp_digit x3 = 0; + sp_digit x4 = 0; + sp_digit x5 = 0; + sp_digit x6 = 0; + sp_digit x7 = 0; + sp_digit x8 = 0; + sp_digit y0 = 0; + sp_digit y1 = 0; + sp_digit y2 = 0; + sp_digit y3 = 0; + sp_digit y4 = 0; + sp_digit y5 = 0; + sp_digit y6 = 0; + sp_digit y7 = 0; + sp_digit y8 = 0; - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->x[4] = 0; - r->x[5] = 0; - r->x[6] = 0; - r->x[7] = 0; - r->x[8] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - r->y[4] = 0; - r->y[5] = 0; - r->y[6] = 0; - r->y[7] = 0; - r->y[8] = 0; for (i = 1; i < 256; i++) { mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->x[4] |= mask & table[i].x[4]; - r->x[5] |= mask & table[i].x[5]; - r->x[6] |= mask & table[i].x[6]; - r->x[7] |= mask & table[i].x[7]; - r->x[8] |= mask & table[i].x[8]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - r->y[4] |= mask & table[i].y[4]; - r->y[5] |= mask & table[i].y[5]; - r->y[6] |= mask & table[i].y[6]; - r->y[7] |= mask & table[i].y[7]; - r->y[8] |= mask & table[i].y[8]; + x0 |= mask & table[i].x[0]; + x1 |= mask & table[i].x[1]; + x2 |= mask & table[i].x[2]; + x3 |= mask & table[i].x[3]; + x4 |= mask & table[i].x[4]; + x5 |= mask & table[i].x[5]; + x6 |= mask & table[i].x[6]; + x7 |= mask & table[i].x[7]; + x8 |= mask & table[i].x[8]; + y0 |= mask & table[i].y[0]; + y1 |= mask & table[i].y[1]; + y2 |= mask & table[i].y[2]; + y3 |= mask & table[i].y[3]; + y4 |= mask & table[i].y[4]; + y5 |= mask & table[i].y[5]; + y6 |= mask & table[i].y[6]; + y7 |= mask & table[i].y[7]; + y8 |= mask & table[i].y[8]; } + + r->x[0] = x0; + r->x[1] = x1; + r->x[2] = x2; + r->x[3] = x3; + r->x[4] = x4; + r->x[5] = x5; + r->x[6] = x6; + r->x[7] = x7; + r->x[8] = x8; + r->y[0] = y0; + r->y[1] = y1; + r->y[2] = y2; + r->y[3] = y3; + r->y[4] = y4; + r->y[5] = y5; + r->y[6] = y6; + r->y[7] = y7; + r->y[8] = y8; } #endif /* !WC_NO_CACHE_RESISTANT */ /* Multiply the point by the scalar and return the result. 
@@ -74104,7 +74935,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_521* point = NULL; + sp_point_521* point = NULL; sp_digit* k = NULL; #else sp_point_521 point[2]; @@ -76546,46 +77377,65 @@ static void sp_521_get_entry_65_9(sp_point_521* r, { int i; sp_digit mask; + sp_digit x0 = 0; + sp_digit x1 = 0; + sp_digit x2 = 0; + sp_digit x3 = 0; + sp_digit x4 = 0; + sp_digit x5 = 0; + sp_digit x6 = 0; + sp_digit x7 = 0; + sp_digit x8 = 0; + sp_digit y0 = 0; + sp_digit y1 = 0; + sp_digit y2 = 0; + sp_digit y3 = 0; + sp_digit y4 = 0; + sp_digit y5 = 0; + sp_digit y6 = 0; + sp_digit y7 = 0; + sp_digit y8 = 0; - r->x[0] = 0; - r->x[1] = 0; - r->x[2] = 0; - r->x[3] = 0; - r->x[4] = 0; - r->x[5] = 0; - r->x[6] = 0; - r->x[7] = 0; - r->x[8] = 0; - r->y[0] = 0; - r->y[1] = 0; - r->y[2] = 0; - r->y[3] = 0; - r->y[4] = 0; - r->y[5] = 0; - r->y[6] = 0; - r->y[7] = 0; - r->y[8] = 0; for (i = 1; i < 65; i++) { mask = 0 - (i == idx); - r->x[0] |= mask & table[i].x[0]; - r->x[1] |= mask & table[i].x[1]; - r->x[2] |= mask & table[i].x[2]; - r->x[3] |= mask & table[i].x[3]; - r->x[4] |= mask & table[i].x[4]; - r->x[5] |= mask & table[i].x[5]; - r->x[6] |= mask & table[i].x[6]; - r->x[7] |= mask & table[i].x[7]; - r->x[8] |= mask & table[i].x[8]; - r->y[0] |= mask & table[i].y[0]; - r->y[1] |= mask & table[i].y[1]; - r->y[2] |= mask & table[i].y[2]; - r->y[3] |= mask & table[i].y[3]; - r->y[4] |= mask & table[i].y[4]; - r->y[5] |= mask & table[i].y[5]; - r->y[6] |= mask & table[i].y[6]; - r->y[7] |= mask & table[i].y[7]; - r->y[8] |= mask & table[i].y[8]; + x0 |= mask & table[i].x[0]; + x1 |= mask & table[i].x[1]; + x2 |= mask & table[i].x[2]; + x3 |= mask & table[i].x[3]; + x4 |= mask & table[i].x[4]; + x5 |= mask & table[i].x[5]; + x6 |= mask & table[i].x[6]; + x7 |= mask & table[i].x[7]; + x8 |= mask & table[i].x[8]; + y0 |= mask & table[i].y[0]; + y1 |= mask & table[i].y[1]; + y2 |= mask & table[i].y[2]; + y3 |= mask & table[i].y[3]; + y4 |= mask & table[i].y[4]; + y5 |= mask & table[i].y[5]; + y6 |= mask & table[i].y[6]; + y7 |= mask & table[i].y[7]; + y8 |= mask & table[i].y[8]; } + + r->x[0] = x0; + r->x[1] = x1; + r->x[2] = x2; + r->x[3] = x3; + r->x[4] = x4; + r->x[5] = x5; + r->x[6] = x6; + r->x[7] = x7; + r->x[8] = x8; + r->y[0] = y0; + r->y[1] = y1; + r->y[2] = y2; + r->y[3] = y3; + r->y[4] = y4; + r->y[5] = y5; + r->y[6] = y6; + r->y[7] = y7; + r->y[8] = y8; } #endif /* !WC_NO_CACHE_RESISTANT */ static const sp_table_entry_521 p521_table[4875] = { @@ -110628,7 +111478,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -110884,7 +111734,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_521* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -110892,7 +111742,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ 
-113256,165 +114106,165 @@ static void sp_1024_sqr_8(sp_digit* r, const sp_digit* a) "ldp x25, x26, [%[a], 32]\n\t" "ldp x27, x28, [%[a], 48]\n\t" "# A[0] * A[1]\n\t" - "mul x6, x21, x22\n\t" - "umulh x7, x21, x22\n\t" + "mul x6, x21, x22\n\t" + "umulh x7, x21, x22\n\t" "# A[0] * A[2]\n\t" - "mul x4, x21, x23\n\t" - "umulh x5, x21, x23\n\t" - "adds x7, x7, x4\n\t" + "mul x4, x21, x23\n\t" + "umulh x5, x21, x23\n\t" + "adds x7, x7, x4\n\t" "# A[0] * A[3]\n\t" - "mul x4, x21, x24\n\t" - "adc x8, xzr, x5\n\t" - "umulh x5, x21, x24\n\t" - "adds x8, x8, x4\n\t" + "mul x4, x21, x24\n\t" + "adc x8, xzr, x5\n\t" + "umulh x5, x21, x24\n\t" + "adds x8, x8, x4\n\t" "# A[1] * A[2]\n\t" - "mul x4, x22, x23\n\t" - "adc x9, xzr, x5\n\t" - "umulh x5, x22, x23\n\t" - "adds x8, x8, x4\n\t" + "mul x4, x22, x23\n\t" + "adc x9, xzr, x5\n\t" + "umulh x5, x22, x23\n\t" + "adds x8, x8, x4\n\t" "# A[0] * A[4]\n\t" - "mul x4, x21, x25\n\t" - "adcs x9, x9, x5\n\t" - "umulh x5, x21, x25\n\t" - "adc x10, xzr, xzr\n\t" - "adds x9, x9, x4\n\t" + "mul x4, x21, x25\n\t" + "adcs x9, x9, x5\n\t" + "umulh x5, x21, x25\n\t" + "adc x10, xzr, xzr\n\t" + "adds x9, x9, x4\n\t" "# A[1] * A[3]\n\t" - "mul x4, x22, x24\n\t" - "adc x10, x10, x5\n\t" - "umulh x5, x22, x24\n\t" - "adds x9, x9, x4\n\t" + "mul x4, x22, x24\n\t" + "adc x10, x10, x5\n\t" + "umulh x5, x22, x24\n\t" + "adds x9, x9, x4\n\t" "# A[0] * A[5]\n\t" - "mul x4, x21, x26\n\t" - "adcs x10, x10, x5\n\t" - "umulh x5, x21, x26\n\t" - "adc x11, xzr, xzr\n\t" - "adds x10, x10, x4\n\t" + "mul x4, x21, x26\n\t" + "adcs x10, x10, x5\n\t" + "umulh x5, x21, x26\n\t" + "adc x11, xzr, xzr\n\t" + "adds x10, x10, x4\n\t" "# A[1] * A[4]\n\t" - "mul x4, x22, x25\n\t" - "adc x11, x11, x5\n\t" - "umulh x5, x22, x25\n\t" - "adds x10, x10, x4\n\t" + "mul x4, x22, x25\n\t" + "adc x11, x11, x5\n\t" + "umulh x5, x22, x25\n\t" + "adds x10, x10, x4\n\t" "# A[2] * A[3]\n\t" - "mul x4, x23, x24\n\t" - "adcs x11, x11, x5\n\t" - "umulh x5, x23, x24\n\t" - "adc x12, xzr, xzr\n\t" - "adds x10, x10, x4\n\t" + "mul x4, x23, x24\n\t" + "adcs x11, x11, x5\n\t" + "umulh x5, x23, x24\n\t" + "adc x12, xzr, xzr\n\t" + "adds x10, x10, x4\n\t" "# A[0] * A[6]\n\t" - "mul x4, x21, x27\n\t" - "adcs x11, x11, x5\n\t" - "umulh x5, x21, x27\n\t" - "adc x12, x12, xzr\n\t" - "adds x11, x11, x4\n\t" + "mul x4, x21, x27\n\t" + "adcs x11, x11, x5\n\t" + "umulh x5, x21, x27\n\t" + "adc x12, x12, xzr\n\t" + "adds x11, x11, x4\n\t" "# A[1] * A[5]\n\t" - "mul x4, x22, x26\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x22, x26\n\t" - "adc x13, xzr, xzr\n\t" - "adds x11, x11, x4\n\t" + "mul x4, x22, x26\n\t" + "adcs x12, x12, x5\n\t" + "umulh x5, x22, x26\n\t" + "adc x13, xzr, xzr\n\t" + "adds x11, x11, x4\n\t" "# A[2] * A[4]\n\t" - "mul x4, x23, x25\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x23, x25\n\t" - "adc x13, x13, xzr\n\t" - "adds x11, x11, x4\n\t" + "mul x4, x23, x25\n\t" + "adcs x12, x12, x5\n\t" + "umulh x5, x23, x25\n\t" + "adc x13, x13, xzr\n\t" + "adds x11, x11, x4\n\t" "# A[0] * A[7]\n\t" - "mul x4, x21, x28\n\t" - "adcs x12, x12, x5\n\t" - "umulh x5, x21, x28\n\t" - "adc x13, x13, xzr\n\t" - "adds x12, x12, x4\n\t" + "mul x4, x21, x28\n\t" + "adcs x12, x12, x5\n\t" + "umulh x5, x21, x28\n\t" + "adc x13, x13, xzr\n\t" + "adds x12, x12, x4\n\t" "# A[1] * A[6]\n\t" - "mul x4, x22, x27\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x22, x27\n\t" - "adc x14, xzr, xzr\n\t" - "adds x12, x12, x4\n\t" + "mul x4, x22, x27\n\t" + "adcs x13, x13, x5\n\t" + "umulh x5, x22, x27\n\t" + "adc x14, xzr, xzr\n\t" + "adds x12, x12, 
x4\n\t" "# A[2] * A[5]\n\t" - "mul x4, x23, x26\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x23, x26\n\t" - "adc x14, x14, xzr\n\t" - "adds x12, x12, x4\n\t" + "mul x4, x23, x26\n\t" + "adcs x13, x13, x5\n\t" + "umulh x5, x23, x26\n\t" + "adc x14, x14, xzr\n\t" + "adds x12, x12, x4\n\t" "# A[3] * A[4]\n\t" - "mul x4, x24, x25\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x24, x25\n\t" - "adc x14, x14, xzr\n\t" - "adds x12, x12, x4\n\t" + "mul x4, x24, x25\n\t" + "adcs x13, x13, x5\n\t" + "umulh x5, x24, x25\n\t" + "adc x14, x14, xzr\n\t" + "adds x12, x12, x4\n\t" "# A[1] * A[7]\n\t" - "mul x4, x22, x28\n\t" - "adcs x13, x13, x5\n\t" - "umulh x5, x22, x28\n\t" - "adc x14, x14, xzr\n\t" - "adds x13, x13, x4\n\t" + "mul x4, x22, x28\n\t" + "adcs x13, x13, x5\n\t" + "umulh x5, x22, x28\n\t" + "adc x14, x14, xzr\n\t" + "adds x13, x13, x4\n\t" "# A[2] * A[6]\n\t" - "mul x4, x23, x27\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x23, x27\n\t" - "adc x15, xzr, xzr\n\t" - "adds x13, x13, x4\n\t" + "mul x4, x23, x27\n\t" + "adcs x14, x14, x5\n\t" + "umulh x5, x23, x27\n\t" + "adc x15, xzr, xzr\n\t" + "adds x13, x13, x4\n\t" "# A[3] * A[5]\n\t" - "mul x4, x24, x26\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x24, x26\n\t" - "adc x15, x15, xzr\n\t" - "adds x13, x13, x4\n\t" + "mul x4, x24, x26\n\t" + "adcs x14, x14, x5\n\t" + "umulh x5, x24, x26\n\t" + "adc x15, x15, xzr\n\t" + "adds x13, x13, x4\n\t" "# A[2] * A[7]\n\t" - "mul x4, x23, x28\n\t" - "adcs x14, x14, x5\n\t" - "umulh x5, x23, x28\n\t" - "adc x15, x15, xzr\n\t" - "adds x14, x14, x4\n\t" + "mul x4, x23, x28\n\t" + "adcs x14, x14, x5\n\t" + "umulh x5, x23, x28\n\t" + "adc x15, x15, xzr\n\t" + "adds x14, x14, x4\n\t" "# A[3] * A[6]\n\t" - "mul x4, x24, x27\n\t" - "adcs x15, x15, x5\n\t" - "umulh x5, x24, x27\n\t" - "adc x16, xzr, xzr\n\t" - "adds x14, x14, x4\n\t" + "mul x4, x24, x27\n\t" + "adcs x15, x15, x5\n\t" + "umulh x5, x24, x27\n\t" + "adc x16, xzr, xzr\n\t" + "adds x14, x14, x4\n\t" "# A[4] * A[5]\n\t" - "mul x4, x25, x26\n\t" - "adcs x15, x15, x5\n\t" - "umulh x5, x25, x26\n\t" - "adc x16, x16, xzr\n\t" - "adds x14, x14, x4\n\t" + "mul x4, x25, x26\n\t" + "adcs x15, x15, x5\n\t" + "umulh x5, x25, x26\n\t" + "adc x16, x16, xzr\n\t" + "adds x14, x14, x4\n\t" "# A[3] * A[7]\n\t" - "mul x4, x24, x28\n\t" - "adcs x15, x15, x5\n\t" - "umulh x5, x24, x28\n\t" - "adc x16, x16, xzr\n\t" - "adds x15, x15, x4\n\t" + "mul x4, x24, x28\n\t" + "adcs x15, x15, x5\n\t" + "umulh x5, x24, x28\n\t" + "adc x16, x16, xzr\n\t" + "adds x15, x15, x4\n\t" "# A[4] * A[6]\n\t" - "mul x4, x25, x27\n\t" - "adcs x16, x16, x5\n\t" - "umulh x5, x25, x27\n\t" - "adc x17, xzr, xzr\n\t" - "adds x15, x15, x4\n\t" + "mul x4, x25, x27\n\t" + "adcs x16, x16, x5\n\t" + "umulh x5, x25, x27\n\t" + "adc x17, xzr, xzr\n\t" + "adds x15, x15, x4\n\t" "# A[4] * A[7]\n\t" - "mul x4, x25, x28\n\t" - "adcs x16, x16, x5\n\t" - "umulh x5, x25, x28\n\t" - "adc x17, x17, xzr\n\t" - "adds x16, x16, x4\n\t" + "mul x4, x25, x28\n\t" + "adcs x16, x16, x5\n\t" + "umulh x5, x25, x28\n\t" + "adc x17, x17, xzr\n\t" + "adds x16, x16, x4\n\t" "# A[5] * A[6]\n\t" - "mul x4, x26, x27\n\t" - "adcs x17, x17, x5\n\t" - "umulh x5, x26, x27\n\t" - "adc x19, xzr, xzr\n\t" - "adds x16, x16, x4\n\t" + "mul x4, x26, x27\n\t" + "adcs x17, x17, x5\n\t" + "umulh x5, x26, x27\n\t" + "adc x19, xzr, xzr\n\t" + "adds x16, x16, x4\n\t" "# A[5] * A[7]\n\t" - "mul x4, x26, x28\n\t" - "adcs x17, x17, x5\n\t" - "umulh x5, x26, x28\n\t" - "adc x19, x19, xzr\n\t" - "adds x17, x17, x4\n\t" + "mul x4, x26, x28\n\t" + "adcs 
x17, x17, x5\n\t" + "umulh x5, x26, x28\n\t" + "adc x19, x19, xzr\n\t" + "adds x17, x17, x4\n\t" "# A[6] * A[7]\n\t" - "mul x4, x27, x28\n\t" - "adcs x19, x19, x5\n\t" - "umulh x5, x27, x28\n\t" - "adc x20, xzr, xzr\n\t" - "adds x19, x19, x4\n\t" - "adc x20, x20, x5\n\t" + "mul x4, x27, x28\n\t" + "adcs x19, x19, x5\n\t" + "umulh x5, x27, x28\n\t" + "adc x20, xzr, xzr\n\t" + "adds x19, x19, x4\n\t" + "adc x20, x20, x5\n\t" "# Double\n\t" "adds x6, x6, x6\n\t" "adcs x7, x7, x7\n\t" @@ -113430,44 +114280,44 @@ static void sp_1024_sqr_8(sp_digit* r, const sp_digit* a) "adcs x17, x17, x17\n\t" "adcs x19, x19, x19\n\t" "# A[0] * A[0]\n\t" - "mul x5, x21, x21\n\t" + "mul x5, x21, x21\n\t" "adcs x20, x20, x20\n\t" - "umulh x2, x21, x21\n\t" + "umulh x2, x21, x21\n\t" "cset x21, cs\n\t" "# A[1] * A[1]\n\t" - "mul x3, x22, x22\n\t" + "mul x3, x22, x22\n\t" "adds x6, x6, x2\n\t" - "umulh x4, x22, x22\n\t" + "umulh x4, x22, x22\n\t" "adcs x7, x7, x3\n\t" "# A[2] * A[2]\n\t" - "mul x2, x23, x23\n\t" + "mul x2, x23, x23\n\t" "adcs x8, x8, x4\n\t" - "umulh x3, x23, x23\n\t" + "umulh x3, x23, x23\n\t" "adcs x9, x9, x2\n\t" "# A[3] * A[3]\n\t" - "mul x4, x24, x24\n\t" + "mul x4, x24, x24\n\t" "adcs x10, x10, x3\n\t" - "umulh x2, x24, x24\n\t" + "umulh x2, x24, x24\n\t" "adcs x11, x11, x4\n\t" "# A[4] * A[4]\n\t" - "mul x3, x25, x25\n\t" + "mul x3, x25, x25\n\t" "adcs x12, x12, x2\n\t" - "umulh x4, x25, x25\n\t" + "umulh x4, x25, x25\n\t" "adcs x13, x13, x3\n\t" "# A[5] * A[5]\n\t" - "mul x2, x26, x26\n\t" + "mul x2, x26, x26\n\t" "adcs x14, x14, x4\n\t" - "umulh x3, x26, x26\n\t" + "umulh x3, x26, x26\n\t" "adcs x15, x15, x2\n\t" "# A[6] * A[6]\n\t" - "mul x4, x27, x27\n\t" + "mul x4, x27, x27\n\t" "adcs x16, x16, x3\n\t" - "umulh x2, x27, x27\n\t" + "umulh x2, x27, x27\n\t" "adcs x17, x17, x4\n\t" "# A[7] * A[7]\n\t" - "mul x3, x28, x28\n\t" + "mul x3, x28, x28\n\t" "adcs x19, x19, x2\n\t" - "umulh x4, x28, x28\n\t" + "umulh x4, x28, x28\n\t" "adcs x20, x20, x3\n\t" "stp x5, x6, [%[r], 0]\n\t" "adc x21, x21, x4\n\t" @@ -113514,7 +114364,7 @@ static sp_digit sp_1024_add_8(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 32]\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 48]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -113662,7 +114512,7 @@ static sp_digit sp_1024_add_16(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r], 96]\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r], 112]\n\t" - "cset %[r], cs\n\t" + "adc %[r], xzr, xzr\n\t" : [r] "+r" (r) : [a] "r" (a), [b] "r" (b) : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "cc" @@ -114189,7 +115039,7 @@ static sp_digit sp_1024_add_16(sp_digit* r, const sp_digit* a, "stp x3, x4, [%[r]], #16\n\t" "adcs x6, x6, x10\n\t" "stp x5, x6, [%[r]], #16\n\t" - "cset %[c], cs\n\t" + "adc %[c], xzr, xzr\n\t" "cmp %[a], x11\n\t" "b.ne 1b\n\t" : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -114381,7 +115231,7 @@ static void sp_1024_mul_d_16(sp_digit* r, const sp_digit* a, /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * - * Assumes divisor has higest bit set. + * Assumes divisor has highest bit set. * * d1 The high order half of the number to divide. * d0 The low order half of the number to divide. 
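Note on the hunks above: several of the AArch64 add helpers now materialise the final carry with adc Xd, xzr, xzr instead of cset Xd, cs. Both forms leave 0 or 1 in the destination when the preceding adcs set the carry flag (cset is an alias of csinc on the zero register, adc just adds the carry to zero), so the return values of sp_1024_add_8 and sp_1024_add_16 are unchanged; the same substitution appears again in the Montgomery reduction below. A stand-alone two-limb illustration of the pattern, not the generated routine:

#include <stdint.h>

/* Two-limb add returning the carry out; mirrors the adds/adcs/.../adc shape
 * used by the sp_1024_add_* routines (the size here is illustrative). */
static uint64_t add_2_carry(uint64_t r[2], const uint64_t a[2], const uint64_t b[2])
{
    uint64_t c;
    __asm__ __volatile__ (
        "ldp  x3, x4, [%[a]]\n\t"
        "ldp  x5, x6, [%[b]]\n\t"
        "adds x3, x3, x5\n\t"          /* low limb, sets carry            */
        "adcs x4, x4, x6\n\t"          /* high limb, consumes carry       */
        "stp  x3, x4, [%[r]]\n\t"
        "adc  %[c], xzr, xzr\n\t"      /* carry out as 0/1 (was: cset cs) */
        : [c] "=r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "x3", "x4", "x5", "x6", "cc"
    );
    return c;
}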
@@ -114763,14 +115613,14 @@ static void sp_1024_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -115177,7 +116027,7 @@ SP_NOINLINE static void sp_1024_mont_reduce_16(sp_digit* a, const sp_digit* m, "umulh x8, x10, x9\n\t" "adds x6, x6, x7\n\t" "adcs x8, x8, x3\n\t" - "cset x3, cs\n\t" + "adc x3, xzr, xzr\n\t" "adds x27, x28, x6\n\t" "ldr x28, [%[a], 128]\n\t" "adcs x28, x28, x8\n\t" @@ -115838,7 +116688,6 @@ static void sp_1024_mont_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* ); } -#define sp_1024_mont_sub_lower_16 sp_1024_mont_sub_16 #ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. @@ -116080,7 +116929,7 @@ static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_16(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_lower_16(y, y, x, p1024_mod); + sp_1024_mont_sub_16(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_16(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -116202,7 +117051,7 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -116227,8 +117076,6 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_1024_mont_dbl_lower_16 sp_1024_mont_dbl_16 -#define sp_1024_mont_tpl_lower_16 sp_1024_mont_tpl_16 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
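Note on the sp_1024_from_mp hunk above (and the matching 32-bit versions in sp_armthumb.c further down): the old per-iteration index arithmetic is replaced by a single counter j that starts at -(a->used). Its sign bit doubles as the copy mask, and the read index o stops advancing once the last used word has been consumed, so out-of-range positions re-read the final word but mask it to zero. A self-contained sketch of the idea, assuming 64-bit digits and at least one allocated source word; names are illustrative:

#include <stdint.h>
#include <stddef.h>

/* Copy `used` source words into a fixed-size destination without branching
 * on the source length. While j (conceptually i - used) is still negative,
 * its sign bit yields an all-ones mask and advances the read index; after
 * that the mask is zero and the last word is re-read harmlessly.
 * Assumes dp points at at least one readable word. */
static void from_words_ct(uint64_t* r, size_t size, const uint64_t* dp, size_t used)
{
    uint64_t j = (uint64_t)0 - (uint64_t)used;
    size_t o = 0;
    size_t i;

    for (i = 0; i < size; i++) {
        uint64_t mask = (uint64_t)0 - (j >> 63);  /* all-ones while i < used      */
        r[i] = dp[o] & mask;
        j++;
        o += (size_t)(j >> 63);                   /* stop advancing at last word  */
    }
}

With 32-bit digits the shift count is 31 instead of 63, which is exactly the difference between the sp_arm64.c and sp_armthumb.c versions of this hunk.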
@@ -116267,7 +117114,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); + sp_1024_mont_tpl_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -116276,8 +117123,8 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i, sp_1024_mont_dbl_16(t2, b, p1024_mod); sp_1024_mont_sub_16(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); + sp_1024_mont_sub_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -116297,7 +117144,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); + sp_1024_mont_tpl_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -116306,8 +117153,8 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i, sp_1024_mont_dbl_16(t2, b, p1024_mod); sp_1024_mont_sub_16(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); + sp_1024_mont_sub_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -116458,12 +117305,12 @@ static int sp_1024_iszero_16(const sp_digit* a) static void sp_1024_proj_point_add_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*16; - sp_digit* t3 = t + 4*16; - sp_digit* t4 = t + 6*16; - sp_digit* t5 = t + 8*16; - sp_digit* t6 = t + 10*16; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*16; + sp_digit* t2 = t + 4*16; + sp_digit* t3 = t + 6*16; + sp_digit* t4 = t + 8*16; + sp_digit* t5 = t + 10*16; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_16(t1, q->z, p1024_mod, p1024_mp_mod); @@ -116485,17 +117332,9 @@ static void sp_1024_proj_point_add_16(sp_point_1024* r, sp_1024_proj_point_dbl_16(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_1024_mont_sub_16(t2, t2, t1, p1024_mod); @@ -116514,20 +117353,31 @@ static void sp_1024_proj_point_add_16(sp_point_1024* r, sp_1024_mont_dbl_16(t3, y, p1024_mod); sp_1024_mont_sub_16(x, x, t3, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_16(y, y, x, p1024_mod); + sp_1024_mont_sub_16(y, y, x, p1024_mod); sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(y, y, t5, p1024_mod); - for (i = 0; i < 16; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & 
maskq) | + (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 16; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 16; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -116573,12 +117423,12 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*16; - ctx->t3 = t + 4*16; - ctx->t4 = t + 6*16; - ctx->t5 = t + 8*16; - ctx->t6 = t + 10*16; + ctx->t6 = t; + ctx->t1 = t + 2*16; + ctx->t2 = t + 4*16; + ctx->t3 = t + 6*16; + ctx->t4 = t + 8*16; + ctx->t5 = t + 10*16; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -116685,7 +117535,7 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 22; break; case 22: @@ -116698,22 +117548,28 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 16; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 16; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 16; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -116772,7 +117628,7 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); + sp_1024_mont_tpl_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -116782,8 +117638,8 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r, sp_1024_mont_dbl_16(t2, b, p1024_mod); sp_1024_mont_sub_16(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); + sp_1024_mont_sub_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(r[j].z, z, y, p1024_mod, p1024_mp_mod); z = r[j].z; @@ -116871,8 +117727,8 @@ static void 
sp_1024_proj_point_add_sub_16(sp_point_1024* ra, sp_1024_mont_sub_16(xs, xs, t1, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_1024_mont_sub_lower_16(ys, ya, xs, p1024_mod); - sp_1024_mont_sub_lower_16(ya, ya, xa, p1024_mod); + sp_1024_mont_sub_16(ys, ya, xs, p1024_mod); + sp_1024_mont_sub_16(ya, ya, xa, p1024_mod); sp_1024_mont_mul_16(ya, ya, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t6, p1024_mod, t6, p1024_mod); sp_1024_mont_mul_16(ys, ys, t6, p1024_mod, p1024_mp_mod); @@ -117000,7 +117856,7 @@ static int sp_1024_ecc_mulmod_win_add_sub_16(sp_point_1024* r, const sp_point_10 (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * + t = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * (65+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -117129,12 +117985,12 @@ typedef struct sp_table_entry_1024 { static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*16; - sp_digit* t3 = t + 4*16; - sp_digit* t4 = t + 6*16; - sp_digit* t5 = t + 8*16; - sp_digit* t6 = t + 10*16; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*16; + sp_digit* t6 = t + 4*16; + sp_digit* t1 = t + 6*16; + sp_digit* t4 = t + 8*16; + sp_digit* t5 = t + 10*16; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -117150,13 +118006,9 @@ static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, sp_1024_proj_point_dbl_16(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_1024_mont_sub_16(t2, t2, p->x, p1024_mod); @@ -117165,33 +118017,40 @@ static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, /* Z3 = H*Z1 */ sp_1024_mont_mul_16(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_1024_mont_sqr_16(t1, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t3, p->x, t5, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_16(x, t1, t5, p1024_mod); - sp_1024_mont_dbl_16(t1, t3, p1024_mod); - sp_1024_mont_sub_16(x, x, t1, p1024_mod); + sp_1024_mont_sqr_16(t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t3, p->x, t1, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t1, t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(t2, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_16(t2, t2, t1, p1024_mod); + sp_1024_mont_dbl_16(t5, t3, p1024_mod); + sp_1024_mont_sub_16(x, t2, t5, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_lower_16(t3, t3, x, p1024_mod); + sp_1024_mont_sub_16(t3, t3, x, p1024_mod); sp_1024_mont_mul_16(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t5, t5, p->y, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_16(y, t3, t5, p1024_mod); + sp_1024_mont_mul_16(t1, t1, p->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_16(y, t3, t1, p1024_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 16; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 16; i++) { + 
r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 16; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 16; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -121053,7 +121912,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, + point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c index 8500f4cd9..bf6b671c5 100644 --- a/wolfcrypt/src/sp_armthumb.c +++ b/wolfcrypt/src/sp_armthumb.c @@ -52,6 +52,15 @@ #include +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif + #ifdef WOLFSSL_SP_ARM_THUMB_ASM #define SP_PRINT_NUM(var, name, total, words, bits) \ do { \ @@ -118,14 +127,14 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -30304,14 +30313,14 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -83393,14 +83402,14 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -97805,14 +97814,14 @@ static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 
1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -100187,7 +100196,6 @@ SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, ); } -#define sp_256_mont_sub_lower_8 sp_256_mont_sub_8 /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * * r Result of division by 2. @@ -100515,7 +100523,7 @@ static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, /* X = X - Y */ sp_256_mont_sub_8(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_lower_8(y, y, x, p256_mod); + sp_256_mont_sub_8(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -100637,7 +100645,7 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -100698,12 +100706,12 @@ static int sp_256_iszero_8(const sp_digit* a) static void sp_256_proj_point_add_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*8; - sp_digit* t3 = t + 4*8; - sp_digit* t4 = t + 6*8; - sp_digit* t5 = t + 8*8; - sp_digit* t6 = t + 10*8; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*8; + sp_digit* t2 = t + 4*8; + sp_digit* t3 = t + 6*8; + sp_digit* t4 = t + 8*8; + sp_digit* t5 = t + 10*8; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod); @@ -100725,17 +100733,9 @@ static void sp_256_proj_point_add_8(sp_point_256* r, sp_256_proj_point_dbl_8(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_256_mont_sub_8(t2, t2, t1, p256_mod); @@ -100754,20 +100754,31 @@ static void sp_256_proj_point_add_8(sp_point_256* r, sp_256_mont_dbl_8(t3, y, p256_mod); sp_256_mont_sub_8(x, x, t3, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_8(y, y, x, p256_mod); + sp_256_mont_sub_8(y, y, x, p256_mod); sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t5, p256_mod); - for (i = 0; i < 8; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 8; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 8; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -100813,12 +100824,12 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*8; - ctx->t3 = t + 4*8; - ctx->t4 = t 
+ 6*8; - ctx->t5 = t + 8*8; - ctx->t6 = t + 10*8; + ctx->t6 = t; + ctx->t1 = t + 2*8; + ctx->t2 = t + 4*8; + ctx->t3 = t + 6*8; + ctx->t4 = t + 8*8; + ctx->t5 = t + 10*8; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -100925,7 +100936,7 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 22; break; case 22: @@ -100938,22 +100949,28 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 8; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 8; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 8; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -101225,8 +101242,6 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons } #ifdef FP_ECC -#define sp_256_mont_dbl_lower_8 sp_256_mont_dbl_8 -#define sp_256_mont_tpl_lower_8 sp_256_mont_tpl_8 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
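Note on the top of the sp_armthumb.c changes above: the patch adds keyword shims so the GNU-style __asm__/__volatile__ spellings used throughout the generated Thumb assembly also build under IAR and Keil. The #define block below mirrors the patch; the barrier helper is a hypothetical usage example only, and whether a given toolchain accepts this exact extended-asm form depends on its inline-assembly support.

/* Shim as added to sp_armthumb.c: the GCC spellings map onto keywords the
 * embedded toolchains accept, so the asm bodies need no per-compiler edits. */
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__        asm
#define __volatile__   volatile
#endif /* __IAR_SYSTEMS_ICC__ */
#ifdef __KEIL__
#define __asm__        __asm
#define __volatile__   volatile
#endif

/* Hypothetical use: code written once with the GCC spellings. */
static inline void compiler_barrier(void)
{
    __asm__ __volatile__ ("" ::: "memory");
}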
@@ -101265,7 +101280,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_8(a, t1, p256_mod); + sp_256_mont_tpl_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -101274,8 +101289,8 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_8(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_8(b, t2, p256_mod); + sp_256_mont_sub_8(t2, b, x, p256_mod); + sp_256_mont_dbl_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -101295,7 +101310,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_8(a, t1, p256_mod); + sp_256_mont_tpl_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -101304,8 +101319,8 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_8(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_8(b, t2, p256_mod); + sp_256_mont_sub_8(t2, b, x, p256_mod); + sp_256_mont_dbl_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -101361,12 +101376,12 @@ typedef struct sp_table_entry_256 { static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*8; - sp_digit* t3 = t + 4*8; - sp_digit* t4 = t + 6*8; - sp_digit* t5 = t + 8*8; - sp_digit* t6 = t + 10*8; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*8; + sp_digit* t6 = t + 4*8; + sp_digit* t1 = t + 6*8; + sp_digit* t4 = t + 8*8; + sp_digit* t5 = t + 10*8; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -101382,13 +101397,9 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, sp_256_proj_point_dbl_8(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_256_mont_sub_8(t2, t2, p->x, p256_mod); @@ -101397,33 +101408,40 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, /* Z3 = H*Z1 */ sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_256_mont_sqr_8(t1, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t3, p->x, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_8(x, t1, t5, p256_mod); - sp_256_mont_dbl_8(t1, t3, p256_mod); - sp_256_mont_sub_8(x, x, t1, p256_mod); + sp_256_mont_sqr_8(t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t3, p->x, t1, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(t2, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_8(t2, t2, t1, p256_mod); + sp_256_mont_dbl_8(t5, t3, p256_mod); + sp_256_mont_sub_8(x, t2, t5, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_lower_8(t3, t3, x, p256_mod); + sp_256_mont_sub_8(t3, t3, x, p256_mod); sp_256_mont_mul_8(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t5, t5, p->y, p256_mod, p256_mp_mod); - sp_256_mont_sub_8(y, t3, t5, p256_mod); + sp_256_mont_mul_8(t1, t1, p->y, p256_mod, p256_mp_mod); + sp_256_mont_sub_8(y, t3, t1, p256_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 8; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 8; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 8; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -102339,7 +102357,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_256* point = NULL; + sp_point_256* point = NULL; sp_digit* k = NULL; #else sp_point_256 point[2]; @@ -103899,7 +103917,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -104130,7 +104148,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_256* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -104138,7 +104156,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* 
priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -105598,7 +105616,7 @@ static void sp_256_mont_inv_order_8(sp_digit* r, const sp_digit* a, sp_256_mont_sqr_n_order_8(t2, t3, 4); /* t = a^ff = t2 * t3 */ sp_256_mont_mul_order_8(t, t2, t3); - /* t3= a^ff00 = t ^ 2 ^ 8 */ + /* t2= a^ff00 = t ^ 2 ^ 8 */ sp_256_mont_sqr_n_order_8(t2, t, 8); /* t = a^ffff = t2 * t */ sp_256_mont_mul_order_8(t, t2, t); @@ -105615,7 +105633,11 @@ static void sp_256_mont_inv_order_8(sp_digit* r, const sp_digit* a, /* t2= a^ffffffff00000000ffffffffffffffff = t2 * t */ sp_256_mont_mul_order_8(t2, t2, t); /* t2= a^ffffffff00000000ffffffffffffffffbce6 */ - for (i=127; i>=112; i--) { + sp_256_mont_sqr_order_8(t2, t2); + sp_256_mont_mul_order_8(t2, t2, a); + sp_256_mont_sqr_n_order_8(t2, t2, 5); + sp_256_mont_mul_order_8(t2, t2, t3); + for (i=121; i>=112; i--) { sp_256_mont_sqr_order_8(t2, t2); if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) { sp_256_mont_mul_order_8(t2, t2, a); @@ -109458,14 +109480,14 @@ static void sp_384_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -110864,7 +110886,6 @@ SP_NOINLINE static void sp_384_mont_sub_12(sp_digit* r, const sp_digit* a, sp_384_cond_add_12(r, r, m, o); } -#define sp_384_mont_sub_lower_12 sp_384_mont_sub_12 /* Right shift a by 1 bit into r. (r = a >> 1) * * r A single precision integer. 
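Note on the sp_256_mont_inv_order_8 hunk above: the first six iterations of the per-bit loop (bits 127..122 of the exponent, the pattern 101111 from the 0xbce6... prefix) are replaced by fixed steps, one square-and-multiply by a, five squarings, and one multiply by t3, which by that point in the function holds a^0xf; the conditional loop then resumes at bit 121 as before. The toy sketch below shows the generic left-to-right square-and-multiply this reshapes; it uses plain modular arithmetic on small values and its names are illustrative, not the Montgomery order code.

#include <stdint.h>

/* Toy left-to-right square-and-multiply; exponent bits are given MSB-first.
 * Assumes m > 1, base < m, and m small enough that products fit in 64 bits.
 * Folding a known run such as 0b1111 into one multiply by a precomputed
 * base^15 replaces four per-bit multiplies with a single multiply, which is
 * the shape of the unrolled steps in the patch. */
static uint64_t pow_mod(uint64_t base, const uint8_t* bits, int nbits, uint64_t m)
{
    uint64_t r = 1 % m;
    int i;
    for (i = 0; i < nbits; i++) {
        r = (r * r) % m;              /* square for every exponent bit   */
        if (bits[i]) {
            r = (r * base) % m;       /* multiply where the bit is set   */
        }
    }
    return r;
}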
@@ -111165,7 +111186,7 @@ static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, /* X = X - Y */ sp_384_mont_sub_12(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_lower_12(y, y, x, p384_mod); + sp_384_mont_sub_12(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -111287,7 +111308,7 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co break; case 16: /* Y = Y - X */ - sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -111350,12 +111371,12 @@ static int sp_384_iszero_12(const sp_digit* a) static void sp_384_proj_point_add_12(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*12; - sp_digit* t3 = t + 4*12; - sp_digit* t4 = t + 6*12; - sp_digit* t5 = t + 8*12; - sp_digit* t6 = t + 10*12; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*12; + sp_digit* t2 = t + 4*12; + sp_digit* t3 = t + 6*12; + sp_digit* t4 = t + 8*12; + sp_digit* t5 = t + 10*12; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod); @@ -111377,17 +111398,9 @@ static void sp_384_proj_point_add_12(sp_point_384* r, sp_384_proj_point_dbl_12(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_384_mont_sub_12(t2, t2, t1, p384_mod); @@ -111406,20 +111419,31 @@ static void sp_384_proj_point_add_12(sp_point_384* r, sp_384_mont_dbl_12(t3, y, p384_mod); sp_384_mont_sub_12(x, x, t3, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_12(y, y, x, p384_mod); + sp_384_mont_sub_12(y, y, x, p384_mod); sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t5, p384_mod); - for (i = 0; i < 12; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 12; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 12; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -111465,12 +111489,12 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*12; - ctx->t3 = t + 4*12; - ctx->t4 = t + 6*12; - ctx->t5 = t + 8*12; - ctx->t6 = t + 10*12; + ctx->t6 = t; + ctx->t1 = t + 2*12; + ctx->t2 = t + 4*12; + ctx->t3 = t + 6*12; + ctx->t4 = t + 8*12; + ctx->t5 = t + 10*12; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -111577,7 +111601,7 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 21: /* Y3 
= R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 22; break; case 22: @@ -111590,22 +111614,28 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 12; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 12; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 12; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -111901,8 +111931,6 @@ static int sp_384_ecc_mulmod_fast_12(sp_point_384* r, const sp_point_384* g, con } #ifdef FP_ECC -#define sp_384_mont_dbl_lower_12 sp_384_mont_dbl_12 -#define sp_384_mont_tpl_lower_12 sp_384_mont_tpl_12 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
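Note on the _nb variants touched throughout (sp_256/384/521/1024_proj_point_add_*_nb): they are resumable state machines. The context struct carries the temporaries and a state counter, each call performs one field operation and advances the state, and the final state now contains the same scoped mask-selection block as the blocking code. A minimal sketch of that calling pattern, with hypothetical names and a made-up in-progress status code:

#include <stdint.h>

typedef struct {
    int      state;     /* which step to perform on the next call   */
    uint64_t acc;       /* stand-in for the temporaries held in ctx */
} nb_ctx_t;

#define NB_IN_PROGRESS  1   /* hypothetical "call me again" status  */
#define NB_DONE         0

/* One step per call: the caller keeps invoking this until it stops
 * returning NB_IN_PROGRESS, mirroring how the proj_point_add_*_nb
 * functions walk their numbered case ladder. */
static int nb_sum_step(nb_ctx_t* ctx, const uint64_t a[3], uint64_t* out)
{
    switch (ctx->state) {
    case 0:
        ctx->acc = a[0];
        ctx->state = 1;
        return NB_IN_PROGRESS;
    case 1:
        ctx->acc += a[1];
        ctx->state = 2;
        return NB_IN_PROGRESS;
    case 2:
        ctx->acc += a[2];
        *out = ctx->acc;
        ctx->state = 0;
        return NB_DONE;
    default:
        ctx->state = 0;
        return NB_DONE;
    }
}

Driving it looks like: nb_ctx_t c = {0}; uint64_t s; while (nb_sum_step(&c, a, &s) == NB_IN_PROGRESS) { /* do other work */ } and the ECC versions do the same with one Montgomery operation per step.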
@@ -111941,7 +111969,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_12(a, t1, p384_mod); + sp_384_mont_tpl_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -111950,8 +111978,8 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_12(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_12(b, t2, p384_mod); + sp_384_mont_sub_12(t2, b, x, p384_mod); + sp_384_mont_dbl_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -111971,7 +111999,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_12(a, t1, p384_mod); + sp_384_mont_tpl_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -111980,8 +112008,8 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_12(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_12(b, t2, p384_mod); + sp_384_mont_sub_12(t2, b, x, p384_mod); + sp_384_mont_dbl_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -112037,12 +112065,12 @@ typedef struct sp_table_entry_384 { static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*12; - sp_digit* t3 = t + 4*12; - sp_digit* t4 = t + 6*12; - sp_digit* t5 = t + 8*12; - sp_digit* t6 = t + 10*12; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*12; + sp_digit* t6 = t + 4*12; + sp_digit* t1 = t + 6*12; + sp_digit* t4 = t + 8*12; + sp_digit* t5 = t + 10*12; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -112058,13 +112086,9 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, sp_384_proj_point_dbl_12(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_384_mont_sub_12(t2, t2, p->x, p384_mod); @@ -112073,33 +112097,40 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, /* Z3 = H*Z1 */ sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_384_mont_sqr_12(t1, t4, p384_mod, p384_mp_mod); - sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t3, p->x, t5, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_12(x, t1, t5, p384_mod); - sp_384_mont_dbl_12(t1, t3, p384_mod); - sp_384_mont_sub_12(x, x, t1, p384_mod); + sp_384_mont_sqr_12(t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t3, p->x, t1, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t1, t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(t2, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_12(t2, t2, t1, p384_mod); + sp_384_mont_dbl_12(t5, t3, p384_mod); + sp_384_mont_sub_12(x, t2, t5, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_lower_12(t3, t3, x, p384_mod); + sp_384_mont_sub_12(t3, t3, x, p384_mod); sp_384_mont_mul_12(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t5, t5, p->y, p384_mod, p384_mp_mod); - sp_384_mont_sub_12(y, t3, t5, p384_mod); + sp_384_mont_mul_12(t1, t1, p->y, p384_mod, p384_mp_mod); + sp_384_mont_sub_12(y, t3, t1, p384_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 12; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 12; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 12; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -113047,7 +113078,7 @@ int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_384* point = NULL; + sp_point_384* point = NULL; sp_digit* k = NULL; #else sp_point_384 point[2]; @@ -114607,7 +114638,7 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -114874,7 +114905,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_384* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -114882,7 +114913,7 @@ int 
sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -120688,14 +120719,14 @@ static void sp_521_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -124581,7 +124612,6 @@ SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r, const sp_digit* a, ); } -#define sp_521_mont_sub_lower_17 sp_521_mont_sub_17 /* Right shift a by 1 bit into r. (r = a >> 1) * * r A single precision integer. @@ -124977,7 +125007,7 @@ static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, /* X = X - Y */ sp_521_mont_sub_17(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_lower_17(y, y, x, p521_mod); + sp_521_mont_sub_17(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -125099,7 +125129,7 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co break; case 16: /* Y = Y - X */ - sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -125165,12 +125195,12 @@ static int sp_521_iszero_17(const sp_digit* a) static void sp_521_proj_point_add_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*17; - sp_digit* t3 = t + 4*17; - sp_digit* t4 = t + 6*17; - sp_digit* t5 = t + 8*17; - sp_digit* t6 = t + 10*17; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*17; + sp_digit* t2 = t + 4*17; + sp_digit* t3 = t + 6*17; + sp_digit* t4 = t + 8*17; + sp_digit* t5 = t + 10*17; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod); @@ -125192,17 +125222,9 @@ static void sp_521_proj_point_add_17(sp_point_521* r, sp_521_proj_point_dbl_17(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_521_mont_sub_17(t2, t2, t1, p521_mod); @@ -125221,20 +125243,31 @@ static void sp_521_proj_point_add_17(sp_point_521* r, sp_521_mont_dbl_17(t3, y, p521_mod); sp_521_mont_sub_17(x, x, t3, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_17(y, y, x, p521_mod); + sp_521_mont_sub_17(y, y, x, p521_mod); sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t5, p521_mod); - for (i = 0; i < 17; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 
17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 17; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 17; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -125280,12 +125313,12 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*17; - ctx->t3 = t + 4*17; - ctx->t4 = t + 6*17; - ctx->t5 = t + 8*17; - ctx->t6 = t + 10*17; + ctx->t6 = t; + ctx->t1 = t + 2*17; + ctx->t2 = t + 4*17; + ctx->t3 = t + 6*17; + ctx->t4 = t + 8*17; + ctx->t5 = t + 10*17; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -125392,7 +125425,7 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 22; break; case 22: @@ -125405,22 +125438,28 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 17; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 17; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 17; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -125750,8 +125789,6 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con } #ifdef FP_ECC -#define sp_521_mont_dbl_lower_17 sp_521_mont_dbl_17 -#define sp_521_mont_tpl_lower_17 sp_521_mont_tpl_17 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
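The point-addition hunks above fold the point-at-infinity handling into the same constant-time mask selection that already chooses between p, q and the freshly computed coordinates. A minimal standalone sketch of that selection pattern, with a hypothetical 4-word field size purely to keep it self-contained (the patch uses the 9/12/15/17/21/32-word sizes of the respective curves):

#include <stdint.h>

typedef uint32_t sp_digit;   /* stand-in for the library's sp_digit */
#define WORDS 4              /* illustrative size only */

/* Pick the result coordinate in constant time:
 *  - p is infinity and q is not  -> take q
 *  - q is infinity and p is not  -> take p
 *  - otherwise                   -> take the freshly computed t
 * The loop always runs over every word, so the memory access pattern does
 * not depend on the infinity flags. */
static void select_coord(sp_digit r[WORDS], const sp_digit p[WORDS],
                         const sp_digit q[WORDS], const sp_digit t[WORDS],
                         unsigned p_inf, unsigned q_inf)
{
    sp_digit maskp = (sp_digit)0 - (sp_digit)(q_inf & (!p_inf));
    sp_digit maskq = (sp_digit)0 - (sp_digit)(p_inf & (!q_inf));
    sp_digit maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < WORDS; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}

The separate inf = p->infinity & q->infinity term in the hunks then marks the both-infinity case explicitly, setting r->z[0] |= inf and r->infinity = (word32)inf instead of deriving it from the copied words.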
@@ -125790,7 +125827,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_17(a, t1, p521_mod); + sp_521_mont_tpl_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -125799,8 +125836,8 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_17(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_17(b, t2, p521_mod); + sp_521_mont_sub_17(t2, b, x, p521_mod); + sp_521_mont_dbl_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -125820,7 +125857,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_17(a, t1, p521_mod); + sp_521_mont_tpl_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -125829,8 +125866,8 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_17(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_17(b, t2, p521_mod); + sp_521_mont_sub_17(t2, b, x, p521_mod); + sp_521_mont_dbl_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -125886,12 +125923,12 @@ typedef struct sp_table_entry_521 { static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*17; - sp_digit* t3 = t + 4*17; - sp_digit* t4 = t + 6*17; - sp_digit* t5 = t + 8*17; - sp_digit* t6 = t + 10*17; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*17; + sp_digit* t6 = t + 4*17; + sp_digit* t1 = t + 6*17; + sp_digit* t4 = t + 8*17; + sp_digit* t5 = t + 10*17; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -125907,13 +125944,9 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, sp_521_proj_point_dbl_17(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_521_mont_sub_17(t2, t2, p->x, p521_mod); @@ -125922,33 +125955,40 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, /* Z3 = H*Z1 */ sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_521_mont_sqr_17(t1, t4, p521_mod, p521_mp_mod); - sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t3, p->x, t5, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_17(x, t1, t5, p521_mod); - sp_521_mont_dbl_17(t1, t3, p521_mod); - sp_521_mont_sub_17(x, x, t1, p521_mod); + sp_521_mont_sqr_17(t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t3, p->x, t1, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t1, t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(t2, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_17(t2, t2, t1, p521_mod); + sp_521_mont_dbl_17(t5, t3, p521_mod); + sp_521_mont_sub_17(x, t2, t5, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_lower_17(t3, t3, x, p521_mod); + sp_521_mont_sub_17(t3, t3, x, p521_mod); sp_521_mont_mul_17(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t5, t5, p->y, p521_mod, p521_mp_mod); - sp_521_mont_sub_17(y, t3, t5, p521_mod); + sp_521_mont_mul_17(t1, t1, p->y, p521_mod, p521_mp_mod); + sp_521_mont_sub_17(y, t3, t1, p521_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 17; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 17; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 17; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -126936,7 +126976,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_521* point = NULL; + sp_point_521* point = NULL; sp_digit* k = NULL; #else sp_point_521 point[2]; @@ -129040,7 +129080,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -129353,7 +129393,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_521* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -129361,7 +129401,7 @@ int 
sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -202460,14 +202500,14 @@ static void sp_1024_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -208859,7 +208899,6 @@ SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r, const sp_digit* a, ); } -#define sp_1024_mont_sub_lower_32 sp_1024_mont_sub_32 /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -209609,7 +209648,7 @@ static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_32(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); + sp_1024_mont_sub_32(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -209731,7 +209770,7 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -210167,12 +210206,12 @@ static int sp_1024_iszero_32(const sp_digit* a) static void sp_1024_proj_point_add_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*32; - sp_digit* t3 = t + 4*32; - sp_digit* t4 = t + 6*32; - sp_digit* t5 = t + 8*32; - sp_digit* t6 = t + 10*32; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*32; + sp_digit* t2 = t + 4*32; + sp_digit* t3 = t + 6*32; + sp_digit* t4 = t + 8*32; + sp_digit* t5 = t + 10*32; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod); @@ -210194,17 +210233,9 @@ static void sp_1024_proj_point_add_32(sp_point_1024* r, sp_1024_proj_point_dbl_32(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_1024_mont_sub_32(t2, t2, t1, p1024_mod); @@ -210223,20 +210254,31 @@ static void sp_1024_proj_point_add_32(sp_point_1024* r, sp_1024_mont_dbl_32(t3, y, p1024_mod); sp_1024_mont_sub_32(x, x, t3, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); + sp_1024_mont_sub_32(y, y, x, p1024_mod); sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t5, p1024_mod); - for (i = 0; i < 32; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = 
(sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 32; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 32; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -210282,12 +210324,12 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*32; - ctx->t3 = t + 4*32; - ctx->t4 = t + 6*32; - ctx->t5 = t + 8*32; - ctx->t6 = t + 10*32; + ctx->t6 = t; + ctx->t1 = t + 2*32; + ctx->t2 = t + 4*32; + ctx->t3 = t + 6*32; + ctx->t4 = t + 8*32; + ctx->t5 = t + 10*32; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -210394,7 +210436,7 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 22; break; case 22: @@ -210407,22 +210449,28 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 32; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 32; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 32; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -210582,8 +210630,6 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, } #if defined(FP_ECC) || !defined(WOLFSSL_SP_SMALL) -#define sp_1024_mont_dbl_lower_32 sp_1024_mont_dbl_32 -#define sp_1024_mont_tpl_lower_32 sp_1024_mont_tpl_32 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
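The sp_*_from_mp rewrites above replace the per-iteration index arithmetic with a counter that starts at -used and a source offset that stops advancing once the input digits are exhausted. A sketch of the same copy-and-zero-pad idea, following the DIGIT_BIT == 32 branches (the other branches use the shift amount matching their digit width); like the original, it assumes dp has at least one digit allocated:

#include <stdint.h>

typedef uint32_t sp_digit;   /* unsigned stand-in for the library's type */
#define SP_WORD_SIZE 32

/* Copy 'used' digits from dp[] into r[0..size-1] and zero-pad the rest
 * without branching on 'used'.  j counts up from -used, so its top bit is
 * set exactly while source digits remain: the derived mask is all-ones for
 * those iterations and zero afterwards, and the offset o stops on the last
 * valid digit so the read stays in bounds. */
static void from_digits(sp_digit* r, int size, const sp_digit* dp, int used)
{
    sp_digit j = (sp_digit)0 - (sp_digit)used;
    int o = 0;
    int i;

    for (i = 0; i < size; i++) {
        sp_digit mask = (sp_digit)0 - (j >> (SP_WORD_SIZE - 1));
        r[i] = dp[o] & mask;
        j++;
        o += (int)(j >> (SP_WORD_SIZE - 1));
    }
}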
@@ -210622,7 +210668,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); + sp_1024_mont_tpl_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -210631,8 +210677,8 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); + sp_1024_mont_sub_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -210652,7 +210698,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); + sp_1024_mont_tpl_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -210661,8 +210707,8 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); + sp_1024_mont_sub_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -210718,12 +210764,12 @@ typedef struct sp_table_entry_1024 { static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*32; - sp_digit* t3 = t + 4*32; - sp_digit* t4 = t + 6*32; - sp_digit* t5 = t + 8*32; - sp_digit* t6 = t + 10*32; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*32; + sp_digit* t6 = t + 4*32; + sp_digit* t1 = t + 6*32; + sp_digit* t4 = t + 8*32; + sp_digit* t5 = t + 10*32; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -210739,13 +210785,9 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, sp_1024_proj_point_dbl_32(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_1024_mont_sub_32(t2, t2, p->x, p1024_mod); @@ -210754,33 +210796,40 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, /* Z3 = H*Z1 */ sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_1024_mont_sqr_32(t1, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t3, p->x, t5, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_32(x, t1, t5, p1024_mod); - sp_1024_mont_dbl_32(t1, t3, p1024_mod); - sp_1024_mont_sub_32(x, x, t1, p1024_mod); + sp_1024_mont_sqr_32(t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t3, p->x, t1, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t1, t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(t2, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_32(t2, t2, t1, p1024_mod); + sp_1024_mont_dbl_32(t5, t3, p1024_mod); + sp_1024_mont_sub_32(x, t2, t5, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_lower_32(t3, t3, x, p1024_mod); + sp_1024_mont_sub_32(t3, t3, x, p1024_mod); sp_1024_mont_mul_32(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t5, t5, p->y, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_32(y, t3, t5, p1024_mod); + sp_1024_mont_mul_32(t1, t1, p->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_32(y, t3, t1, p1024_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 32; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 32; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 32; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -215214,7 +215263,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, + point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index 39423695b..dc5c3385d 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -56,6 +56,15 @@ #include +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif + #ifndef WOLFSSL_SP_ASM #if SP_WORD_SIZE == 32 #define SP_PRINT_NUM(var, 
name, total, words, bits) \ @@ -139,14 +148,14 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 29 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 28); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 28); } #elif DIGIT_BIT > 29 unsigned int i; @@ -4900,14 +4909,14 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 29 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 28); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 28); } #elif DIGIT_BIT > 29 unsigned int i; @@ -8205,14 +8214,14 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 28 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 27); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 27); } #elif DIGIT_BIT > 28 unsigned int i; @@ -12548,14 +12557,14 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 29 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 28); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 28); } #elif DIGIT_BIT > 29 unsigned int i; @@ -15721,14 +15730,14 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 26 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 25); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 25); } #elif DIGIT_BIT > 26 unsigned int i; @@ -20442,14 +20451,14 @@ static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 29 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 28); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 28); } #elif DIGIT_BIT > 29 unsigned int i; @@ -21217,7 +21226,6 @@ static void sp_256_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_256_norm_9(r); } -#define 
sp_256_mont_sub_lower_9 sp_256_mont_sub_9 /* Shift number left one bit. * Bottom bit is lost. * @@ -21312,7 +21320,7 @@ static void sp_256_proj_point_dbl_9(sp_point_256* r, const sp_point_256* p, /* X = X - Y */ sp_256_mont_sub_9(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_lower_9(y, y, x, p256_mod); + sp_256_mont_sub_9(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_9(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -21434,7 +21442,7 @@ static int sp_256_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_9(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -21496,12 +21504,12 @@ static int sp_256_iszero_9(const sp_digit* a) static void sp_256_proj_point_add_9(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*9; - sp_digit* t3 = t + 4*9; - sp_digit* t4 = t + 6*9; - sp_digit* t5 = t + 8*9; - sp_digit* t6 = t + 10*9; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*9; + sp_digit* t2 = t + 4*9; + sp_digit* t3 = t + 6*9; + sp_digit* t4 = t + 8*9; + sp_digit* t5 = t + 10*9; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_9(t1, q->z, p256_mod, p256_mp_mod); @@ -21523,17 +21531,9 @@ static void sp_256_proj_point_add_9(sp_point_256* r, sp_256_proj_point_dbl_9(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_256_mont_sub_9(t2, t2, t1, p256_mod); @@ -21552,20 +21552,31 @@ static void sp_256_proj_point_add_9(sp_point_256* r, sp_256_mont_dbl_9(t3, y, p256_mod); sp_256_mont_sub_9(x, x, t3, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_9(y, y, x, p256_mod); + sp_256_mont_sub_9(y, y, x, p256_mod); sp_256_mont_mul_9(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_9(y, y, t5, p256_mod); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -21611,12 +21622,12 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*9; - ctx->t3 = t + 4*9; - ctx->t4 = t + 6*9; - ctx->t5 = t + 8*9; - ctx->t6 = t + 10*9; + ctx->t6 = t; + ctx->t1 = t + 2*9; + ctx->t2 = t + 4*9; + ctx->t3 = t + 6*9; + ctx->t4 = t + 8*9; + ctx->t5 = t + 10*9; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -21723,7 +21734,7 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, 
sp_point_256* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_9(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 22; break; case 22: @@ -21736,22 +21747,28 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -22172,8 +22189,6 @@ static void sp_256_cond_copy_9(sp_digit* r, const sp_digit* a, const sp_digit m) #endif /* WOLFSSL_SP_SMALL */ } -#define sp_256_mont_dbl_lower_9 sp_256_mont_dbl_9 -#define sp_256_mont_tpl_lower_9 sp_256_mont_tpl_9 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
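In these 32-bit C builds the *_lower helpers were plain aliases for the full reducing routines (see the removed #define lines), so the hunks simply call sp_*_mont_sub_*, sp_*_mont_dbl_* and sp_*_mont_tpl_* directly. As a reminder of what those reducing helpers compute, here is a generic constant-time modular-subtraction sketch over a hypothetical 4-word field; the real functions work on the curve-specific word counts and partial-word representations:

#include <stdint.h>

#define WORDS 4   /* illustrative width only */

/* r = (a - b) mod m for WORDS-word little-endian values with a, b < m.
 * Subtract word by word, tracking the borrow in a signed accumulator, then
 * add m back masked by the final borrow, so there is no branch on whether
 * a < b.  (Relies on arithmetic right shift of the negative accumulator,
 * which mainstream compilers provide.) */
static void mod_sub_words(uint32_t* r, const uint32_t* a, const uint32_t* b,
                          const uint32_t* m)
{
    int64_t t = 0;
    uint64_t c = 0;
    uint32_t mask;
    int i;

    for (i = 0; i < WORDS; i++) {
        t += (int64_t)a[i] - b[i];
        r[i] = (uint32_t)t;
        t >>= 32;                    /* borrow propagates as 0 or -1 */
    }
    mask = (uint32_t)t;              /* all-ones exactly when a < b */
    for (i = 0; i < WORDS; i++) {
        c += (uint64_t)r[i] + (m[i] & mask);
        r[i] = (uint32_t)c;
        c >>= 32;
    }
}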
@@ -22212,7 +22227,7 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_9(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_9(a, t1, p256_mod); + sp_256_mont_tpl_9(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod); @@ -22221,8 +22236,8 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int i, sp_256_mont_dbl_9(t2, b, p256_mod); sp_256_mont_sub_9(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_9(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_9(b, t2, p256_mod); + sp_256_mont_sub_9(t2, b, x, p256_mod); + sp_256_mont_dbl_9(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_9(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -22242,7 +22257,7 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_9(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_9(a, t1, p256_mod); + sp_256_mont_tpl_9(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod); @@ -22251,8 +22266,8 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int i, sp_256_mont_dbl_9(t2, b, p256_mod); sp_256_mont_sub_9(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_9(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_9(b, t2, p256_mod); + sp_256_mont_sub_9(t2, b, x, p256_mod); + sp_256_mont_dbl_9(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_9(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -22308,7 +22323,7 @@ static void sp_256_proj_point_dbl_n_store_9(sp_point_256* r, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_9(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_9(a, t1, p256_mod); + sp_256_mont_tpl_9(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod); @@ -22318,8 +22333,8 @@ static void sp_256_proj_point_dbl_n_store_9(sp_point_256* r, sp_256_mont_dbl_9(t2, b, p256_mod); sp_256_mont_sub_9(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_9(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_9(b, t2, p256_mod); + sp_256_mont_sub_9(t2, b, x, p256_mod); + sp_256_mont_dbl_9(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_9(r[j].z, z, y, p256_mod, p256_mp_mod); z = r[j].z; @@ -22407,8 +22422,8 @@ static void sp_256_proj_point_add_sub_9(sp_point_256* ra, sp_256_mont_sub_9(xs, xs, t1, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_256_mont_sub_lower_9(ys, ya, xs, p256_mod); - sp_256_mont_sub_lower_9(ya, ya, xa, p256_mod); + sp_256_mont_sub_9(ys, ya, xs, p256_mod); + sp_256_mont_sub_9(ya, ya, xa, p256_mod); sp_256_mont_mul_9(ya, ya, t4, p256_mod, p256_mp_mod); sp_256_sub_9(t6, p256_mod, t6); sp_256_mont_mul_9(ys, ys, t6, p256_mod, p256_mp_mod); @@ -22600,7 +22615,7 @@ static int sp_256_ecc_mulmod_win_add_sub_9(sp_point_256* r, const sp_point_256* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * + t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -22719,12 +22734,12 @@ static int sp_256_ecc_mulmod_win_add_sub_9(sp_point_256* r, const sp_point_256* static void sp_256_proj_point_add_qz1_9(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 
= t; - sp_digit* t2 = t + 2*9; - sp_digit* t3 = t + 4*9; - sp_digit* t4 = t + 6*9; - sp_digit* t5 = t + 8*9; - sp_digit* t6 = t + 10*9; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*9; + sp_digit* t6 = t + 4*9; + sp_digit* t1 = t + 6*9; + sp_digit* t4 = t + 8*9; + sp_digit* t5 = t + 10*9; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -22740,13 +22755,9 @@ static void sp_256_proj_point_add_qz1_9(sp_point_256* r, sp_256_proj_point_dbl_9(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_256_mont_sub_9(t2, t2, p->x, p256_mod); @@ -22755,33 +22766,40 @@ static void sp_256_proj_point_add_qz1_9(sp_point_256* r, /* Z3 = H*Z1 */ sp_256_mont_mul_9(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_256_mont_sqr_9(t1, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_9(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(t3, p->x, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_9(x, t1, t5, p256_mod); - sp_256_mont_dbl_9(t1, t3, p256_mod); - sp_256_mont_sub_9(x, x, t1, p256_mod); + sp_256_mont_sqr_9(t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t3, p->x, t1, p256_mod, p256_mp_mod); + sp_256_mont_mul_9(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_9(t2, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(t2, t2, t1, p256_mod); + sp_256_mont_dbl_9(t5, t3, p256_mod); + sp_256_mont_sub_9(x, t2, t5, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_lower_9(t3, t3, x, p256_mod); + sp_256_mont_sub_9(t3, t3, x, p256_mod); sp_256_mont_mul_9(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_9(t5, t5, p->y, p256_mod, p256_mp_mod); - sp_256_mont_sub_9(y, t3, t5, p256_mod); + sp_256_mont_mul_9(t1, t1, p->y, p256_mod, p256_mp_mod); + sp_256_mont_sub_9(y, t3, t1, p256_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -23302,7 +23320,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_256* point = NULL; + sp_point_256* point = NULL; sp_digit* k = NULL; #else sp_point_256 point[2]; @@ -24781,7 +24799,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, + point = 
(sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -24935,7 +24953,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_256* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -24943,7 +24961,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -25578,7 +25596,7 @@ static void sp_256_mont_inv_order_9(sp_digit* r, const sp_digit* a, sp_256_mont_sqr_n_order_9(t2, t3, 4); /* t = a^ff = t2 * t3 */ sp_256_mont_mul_order_9(t, t2, t3); - /* t3= a^ff00 = t ^ 2 ^ 8 */ + /* t2= a^ff00 = t ^ 2 ^ 8 */ sp_256_mont_sqr_n_order_9(t2, t, 8); /* t = a^ffff = t2 * t */ sp_256_mont_mul_order_9(t, t2, t); @@ -25595,7 +25613,11 @@ static void sp_256_mont_inv_order_9(sp_digit* r, const sp_digit* a, /* t2= a^ffffffff00000000ffffffffffffffff = t2 * t */ sp_256_mont_mul_order_9(t2, t2, t); /* t2= a^ffffffff00000000ffffffffffffffffbce6 */ - for (i=127; i>=112; i--) { + sp_256_mont_sqr_order_9(t2, t2); + sp_256_mont_mul_order_9(t2, t2, a); + sp_256_mont_sqr_n_order_9(t2, t2, 5); + sp_256_mont_mul_order_9(t2, t2, t3); + for (i=121; i>=112; i--) { sp_256_mont_sqr_order_9(t2, t2); if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) { sp_256_mont_mul_order_9(t2, t2, a); @@ -27789,14 +27811,14 @@ static void sp_384_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 26 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 25); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 25); } #elif DIGIT_BIT > 26 unsigned int i; @@ -28630,7 +28652,6 @@ static void sp_384_mont_sub_15(sp_digit* r, const sp_digit* a, const sp_digit* b sp_384_norm_15(r); } -#define sp_384_mont_sub_lower_15 sp_384_mont_sub_15 /* Shift number left one bit. * Bottom bit is lost. 
* @@ -28731,7 +28752,7 @@ static void sp_384_proj_point_dbl_15(sp_point_384* r, const sp_point_384* p, /* X = X - Y */ sp_384_mont_sub_15(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_lower_15(y, y, x, p384_mod); + sp_384_mont_sub_15(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_15(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -28853,7 +28874,7 @@ static int sp_384_proj_point_dbl_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co break; case 16: /* Y = Y - X */ - sp_384_mont_sub_lower_15(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_15(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -28917,12 +28938,12 @@ static int sp_384_iszero_15(const sp_digit* a) static void sp_384_proj_point_add_15(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*15; - sp_digit* t3 = t + 4*15; - sp_digit* t4 = t + 6*15; - sp_digit* t5 = t + 8*15; - sp_digit* t6 = t + 10*15; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*15; + sp_digit* t2 = t + 4*15; + sp_digit* t3 = t + 6*15; + sp_digit* t4 = t + 8*15; + sp_digit* t5 = t + 10*15; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_15(t1, q->z, p384_mod, p384_mp_mod); @@ -28944,17 +28965,9 @@ static void sp_384_proj_point_add_15(sp_point_384* r, sp_384_proj_point_dbl_15(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_384_mont_sub_15(t2, t2, t1, p384_mod); @@ -28973,20 +28986,31 @@ static void sp_384_proj_point_add_15(sp_point_384* r, sp_384_mont_dbl_15(t3, y, p384_mod); sp_384_mont_sub_15(x, x, t3, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_15(y, y, x, p384_mod); + sp_384_mont_sub_15(y, y, x, p384_mod); sp_384_mont_mul_15(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_15(y, y, t5, p384_mod); - for (i = 0; i < 15; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 15; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 15; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 15; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -29032,12 +29056,12 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*15; - ctx->t3 = t + 4*15; - ctx->t4 = t + 6*15; - ctx->t5 = t + 8*15; - ctx->t6 = t + 10*15; + ctx->t6 = t; + ctx->t1 = t + 2*15; + ctx->t2 = t + 4*15; + ctx->t3 = t + 6*15; + ctx->t4 = t + 8*15; + ctx->t5 = t + 10*15; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -29144,7 +29168,7 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 21: /* Y3 = R*(U1*H^2 
- X3) - S1*H^3 */ - sp_384_mont_sub_lower_15(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_15(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 22; break; case 22: @@ -29157,22 +29181,28 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 15; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 15; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 15; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 15; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -29649,8 +29679,6 @@ static void sp_384_cond_copy_15(sp_digit* r, const sp_digit* a, const sp_digit m #endif /* WOLFSSL_SP_SMALL */ } -#define sp_384_mont_dbl_lower_15 sp_384_mont_dbl_15 -#define sp_384_mont_tpl_lower_15 sp_384_mont_tpl_15 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
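The sp_256_mont_inv_order_9 hunk earlier in this file peels a few iterations of the bit-scanning loop over p256_order_low into explicit squarings and multiplications and restarts the loop at bit 121. Underneath it is ordinary left-to-right square-and-multiply; a single-word toy version of that pattern (the real code squares and multiplies 9-word Montgomery-form values):

#include <stdint.h>

/* Left-to-right square-and-multiply: square once per exponent bit and
 * multiply by the base whenever the bit is set.  Assumes a < m and m > 0. */
static uint32_t modexp32(uint32_t a, uint32_t e, uint32_t m)
{
    uint64_t r = 1;
    int i;

    for (i = 31; i >= 0; i--) {
        r = (r * r) % m;             /* square for every bit */
        if ((e >> i) & 1) {
            r = (r * a) % m;         /* multiply on set bits */
        }
    }
    return (uint32_t)r;
}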
@@ -29689,7 +29717,7 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_15(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_15(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_15(a, t1, p384_mod); + sp_384_mont_tpl_15(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_15(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_15(b, t1, x, p384_mod, p384_mp_mod); @@ -29698,8 +29726,8 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int i, sp_384_mont_dbl_15(t2, b, p384_mod); sp_384_mont_sub_15(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_15(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_15(b, t2, p384_mod); + sp_384_mont_sub_15(t2, b, x, p384_mod); + sp_384_mont_dbl_15(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_15(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -29719,7 +29747,7 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_15(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_15(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_15(a, t1, p384_mod); + sp_384_mont_tpl_15(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_15(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_15(b, t1, x, p384_mod, p384_mp_mod); @@ -29728,8 +29756,8 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int i, sp_384_mont_dbl_15(t2, b, p384_mod); sp_384_mont_sub_15(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_15(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_15(b, t2, p384_mod); + sp_384_mont_sub_15(t2, b, x, p384_mod); + sp_384_mont_dbl_15(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_15(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -29785,7 +29813,7 @@ static void sp_384_proj_point_dbl_n_store_15(sp_point_384* r, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_15(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_15(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_15(a, t1, p384_mod); + sp_384_mont_tpl_15(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_15(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_15(b, t1, x, p384_mod, p384_mp_mod); @@ -29795,8 +29823,8 @@ static void sp_384_proj_point_dbl_n_store_15(sp_point_384* r, sp_384_mont_dbl_15(t2, b, p384_mod); sp_384_mont_sub_15(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_15(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_15(b, t2, p384_mod); + sp_384_mont_sub_15(t2, b, x, p384_mod); + sp_384_mont_dbl_15(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_15(r[j].z, z, y, p384_mod, p384_mp_mod); z = r[j].z; @@ -29884,8 +29912,8 @@ static void sp_384_proj_point_add_sub_15(sp_point_384* ra, sp_384_mont_sub_15(xs, xs, t1, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_384_mont_sub_lower_15(ys, ya, xs, p384_mod); - sp_384_mont_sub_lower_15(ya, ya, xa, p384_mod); + sp_384_mont_sub_15(ys, ya, xs, p384_mod); + sp_384_mont_sub_15(ya, ya, xa, p384_mod); sp_384_mont_mul_15(ya, ya, t4, p384_mod, p384_mp_mod); sp_384_sub_15(t6, p384_mod, t6); sp_384_mont_mul_15(ys, ys, t6, p384_mod, p384_mp_mod); @@ -30113,7 +30141,7 @@ static int sp_384_ecc_mulmod_win_add_sub_15(sp_point_384* r, const sp_point_384* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * + t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -30232,12 +30260,12 @@ static int sp_384_ecc_mulmod_win_add_sub_15(sp_point_384* r, const sp_point_384* static void sp_384_proj_point_add_qz1_15(sp_point_384* r, const 
sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*15; - sp_digit* t3 = t + 4*15; - sp_digit* t4 = t + 6*15; - sp_digit* t5 = t + 8*15; - sp_digit* t6 = t + 10*15; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*15; + sp_digit* t6 = t + 4*15; + sp_digit* t1 = t + 6*15; + sp_digit* t4 = t + 8*15; + sp_digit* t5 = t + 10*15; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -30253,13 +30281,9 @@ static void sp_384_proj_point_add_qz1_15(sp_point_384* r, sp_384_proj_point_dbl_15(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_384_mont_sub_15(t2, t2, p->x, p384_mod); @@ -30268,33 +30292,40 @@ static void sp_384_proj_point_add_qz1_15(sp_point_384* r, /* Z3 = H*Z1 */ sp_384_mont_mul_15(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_384_mont_sqr_15(t1, t4, p384_mod, p384_mp_mod); - sp_384_mont_sqr_15(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(t3, p->x, t5, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_15(x, t1, t5, p384_mod); - sp_384_mont_dbl_15(t1, t3, p384_mod); - sp_384_mont_sub_15(x, x, t1, p384_mod); + sp_384_mont_sqr_15(t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t3, p->x, t1, p384_mod, p384_mp_mod); + sp_384_mont_mul_15(t1, t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_15(t2, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_15(t2, t2, t1, p384_mod); + sp_384_mont_dbl_15(t5, t3, p384_mod); + sp_384_mont_sub_15(x, t2, t5, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_lower_15(t3, t3, x, p384_mod); + sp_384_mont_sub_15(t3, t3, x, p384_mod); sp_384_mont_mul_15(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_15(t5, t5, p->y, p384_mod, p384_mp_mod); - sp_384_mont_sub_15(y, t3, t5, p384_mod); + sp_384_mont_mul_15(t1, t1, p->y, p384_mod, p384_mp_mod); + sp_384_mont_sub_15(y, t3, t1, p384_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 15; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 15; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 15; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 15; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 15; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -30839,7 +30870,7 @@ int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_384* point = NULL; + sp_point_384* point = NULL; sp_digit* k = NULL; #else sp_point_384 point[2]; @@ -32830,7 +32861,7 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef 
WOLFSSL_SP_SMALL_STACK - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -32984,7 +33015,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_384* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -32992,7 +33023,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -35513,14 +35544,14 @@ static void sp_521_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 25 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 24); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 24); } #elif DIGIT_BIT > 25 unsigned int i; @@ -36289,7 +36320,6 @@ static void sp_521_mont_sub_21(sp_digit* r, const sp_digit* a, const sp_digit* b sp_521_norm_21(r); } -#define sp_521_mont_sub_lower_21 sp_521_mont_sub_21 /* Shift number left one bit. * Bottom bit is lost. * @@ -36396,7 +36426,7 @@ static void sp_521_proj_point_dbl_21(sp_point_521* r, const sp_point_521* p, /* X = X - Y */ sp_521_mont_sub_21(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_lower_21(y, y, x, p521_mod); + sp_521_mont_sub_21(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_21(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -36518,7 +36548,7 @@ static int sp_521_proj_point_dbl_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co break; case 16: /* Y = Y - X */ - sp_521_mont_sub_lower_21(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_21(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -36585,12 +36615,12 @@ static int sp_521_iszero_21(const sp_digit* a) static void sp_521_proj_point_add_21(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*21; - sp_digit* t3 = t + 4*21; - sp_digit* t4 = t + 6*21; - sp_digit* t5 = t + 8*21; - sp_digit* t6 = t + 10*21; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*21; + sp_digit* t2 = t + 4*21; + sp_digit* t3 = t + 6*21; + sp_digit* t4 = t + 8*21; + sp_digit* t5 = t + 10*21; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_21(t1, q->z, p521_mod, p521_mp_mod); @@ -36612,17 +36642,9 @@ static void sp_521_proj_point_add_21(sp_point_521* r, sp_521_proj_point_dbl_21(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_521_mont_sub_21(t2, t2, t1, p521_mod); @@ -36641,20 +36663,31 @@ static void sp_521_proj_point_add_21(sp_point_521* r, sp_521_mont_dbl_21(t3, y, p521_mod); sp_521_mont_sub_21(x, x, t3, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_21(y, y, x, p521_mod); + sp_521_mont_sub_21(y, y, x, p521_mod); sp_521_mont_mul_21(y, y, t4, 
p521_mod, p521_mp_mod); sp_521_mont_sub_21(y, y, t5, p521_mod); - for (i = 0; i < 21; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 21; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 21; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 21; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -36700,12 +36733,12 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*21; - ctx->t3 = t + 4*21; - ctx->t4 = t + 6*21; - ctx->t5 = t + 8*21; - ctx->t6 = t + 10*21; + ctx->t6 = t; + ctx->t1 = t + 2*21; + ctx->t2 = t + 4*21; + ctx->t3 = t + 6*21; + ctx->t4 = t + 8*21; + ctx->t5 = t + 10*21; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -36812,7 +36845,7 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_21(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_21(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 22; break; case 22: @@ -36825,22 +36858,28 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 21; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 21; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 21; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 21; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -37179,8 +37218,6 @@ static void sp_521_cond_copy_21(sp_digit* r, const sp_digit* a, const sp_digit m #endif /* WOLFSSL_SP_SMALL */ } -#define sp_521_mont_dbl_lower_21 sp_521_mont_dbl_21 -#define sp_521_mont_tpl_lower_21 sp_521_mont_tpl_21 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
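The point-add hunks above move the maskp/maskq/maskt computation into a block that runs after the field arithmetic and collapse the two infinity flags into a single `inf` value that is both ORed into r->z[0] and stored in r->infinity. A minimal standalone sketch of the constant-time select they perform, using placeholder names (digit, WORDS) rather than the library's sp_digit/sp_point types:

typedef unsigned long long digit;   /* stands in for sp_digit            */
#define WORDS 9                     /* words per coordinate (for example) */

/* Pick r = p, q or t without branching on the infinity flags (each 0 or 1):
 * only q infinite -> copy p; only p infinite -> copy q; otherwise keep the
 * freshly computed coordinate t. */
static void select_point(digit r[WORDS], const digit p[WORDS],
                         const digit q[WORDS], const digit t[WORDS],
                         int p_inf, int q_inf)
{
    digit maskp = (digit)0 - (digit)(q_inf & (!p_inf)); /* all ones or zero */
    digit maskq = (digit)0 - (digit)(p_inf & (!q_inf));
    digit maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < WORDS; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}

When both inputs are infinite, maskt is all ones and the computed words are kept; the patch records that case separately in `inf` so the result is still marked as the point at infinity.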
@@ -37219,7 +37256,7 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_21(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_21(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_21(a, t1, p521_mod); + sp_521_mont_tpl_21(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_21(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_21(b, t1, x, p521_mod, p521_mp_mod); @@ -37228,8 +37265,8 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int i, sp_521_mont_dbl_21(t2, b, p521_mod); sp_521_mont_sub_21(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_21(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_21(b, t2, p521_mod); + sp_521_mont_sub_21(t2, b, x, p521_mod); + sp_521_mont_dbl_21(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_21(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -37249,7 +37286,7 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_21(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_21(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_21(a, t1, p521_mod); + sp_521_mont_tpl_21(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_21(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_21(b, t1, x, p521_mod, p521_mp_mod); @@ -37258,8 +37295,8 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int i, sp_521_mont_dbl_21(t2, b, p521_mod); sp_521_mont_sub_21(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_21(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_21(b, t2, p521_mod); + sp_521_mont_sub_21(t2, b, x, p521_mod); + sp_521_mont_dbl_21(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_21(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -37315,7 +37352,7 @@ static void sp_521_proj_point_dbl_n_store_21(sp_point_521* r, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_21(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_21(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_21(a, t1, p521_mod); + sp_521_mont_tpl_21(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_21(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_21(b, t1, x, p521_mod, p521_mp_mod); @@ -37325,8 +37362,8 @@ static void sp_521_proj_point_dbl_n_store_21(sp_point_521* r, sp_521_mont_dbl_21(t2, b, p521_mod); sp_521_mont_sub_21(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_21(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_21(b, t2, p521_mod); + sp_521_mont_sub_21(t2, b, x, p521_mod); + sp_521_mont_dbl_21(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_21(r[j].z, z, y, p521_mod, p521_mp_mod); z = r[j].z; @@ -37414,8 +37451,8 @@ static void sp_521_proj_point_add_sub_21(sp_point_521* ra, sp_521_mont_sub_21(xs, xs, t1, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_521_mont_sub_lower_21(ys, ya, xs, p521_mod); - sp_521_mont_sub_lower_21(ya, ya, xa, p521_mod); + sp_521_mont_sub_21(ys, ya, xs, p521_mod); + sp_521_mont_sub_21(ya, ya, xa, p521_mod); sp_521_mont_mul_21(ya, ya, t4, p521_mod, p521_mp_mod); sp_521_sub_21(t6, p521_mod, t6); sp_521_mont_mul_21(ys, ys, t6, p521_mod, p521_mp_mod); @@ -37679,7 +37716,7 @@ static int sp_521_ecc_mulmod_win_add_sub_21(sp_point_521* r, const sp_point_521* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * + t = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -37798,12 +37835,12 @@ static int sp_521_ecc_mulmod_win_add_sub_21(sp_point_521* r, const sp_point_521* static void sp_521_proj_point_add_qz1_21(sp_point_521* r, const 
sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*21; - sp_digit* t3 = t + 4*21; - sp_digit* t4 = t + 6*21; - sp_digit* t5 = t + 8*21; - sp_digit* t6 = t + 10*21; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*21; + sp_digit* t6 = t + 4*21; + sp_digit* t1 = t + 6*21; + sp_digit* t4 = t + 8*21; + sp_digit* t5 = t + 10*21; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -37819,13 +37856,9 @@ static void sp_521_proj_point_add_qz1_21(sp_point_521* r, sp_521_proj_point_dbl_21(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_521_mont_sub_21(t2, t2, p->x, p521_mod); @@ -37834,33 +37867,40 @@ static void sp_521_proj_point_add_qz1_21(sp_point_521* r, /* Z3 = H*Z1 */ sp_521_mont_mul_21(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_521_mont_sqr_21(t1, t4, p521_mod, p521_mp_mod); - sp_521_mont_sqr_21(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(t3, p->x, t5, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_21(x, t1, t5, p521_mod); - sp_521_mont_dbl_21(t1, t3, p521_mod); - sp_521_mont_sub_21(x, x, t1, p521_mod); + sp_521_mont_sqr_21(t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t3, p->x, t1, p521_mod, p521_mp_mod); + sp_521_mont_mul_21(t1, t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_21(t2, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_21(t2, t2, t1, p521_mod); + sp_521_mont_dbl_21(t5, t3, p521_mod); + sp_521_mont_sub_21(x, t2, t5, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_lower_21(t3, t3, x, p521_mod); + sp_521_mont_sub_21(t3, t3, x, p521_mod); sp_521_mont_mul_21(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_21(t5, t5, p->y, p521_mod, p521_mp_mod); - sp_521_mont_sub_21(y, t3, t5, p521_mod); + sp_521_mont_mul_21(t1, t1, p->y, p521_mod, p521_mp_mod); + sp_521_mont_sub_21(y, t3, t1, p521_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 21; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 21; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 21; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 21; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 21; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -38429,7 +38469,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_521* point = NULL; + sp_point_521* point = NULL; sp_digit* k = NULL; #else sp_point_521 point[2]; @@ -40930,7 +40970,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef 
WOLFSSL_SP_SMALL_STACK - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -41085,7 +41125,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_521* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -41093,7 +41133,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -44421,14 +44461,14 @@ static void sp_1024_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 25 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 24); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 24); } #elif DIGIT_BIT > 25 unsigned int i; @@ -45055,7 +45095,6 @@ static void sp_1024_mont_sub_42(sp_digit* r, const sp_digit* a, const sp_digit* sp_1024_norm_42(r); } -#define sp_1024_mont_sub_lower_42 sp_1024_mont_sub_42 /* Shift number left one bit. * Bottom bit is lost. * @@ -45183,7 +45222,7 @@ static void sp_1024_proj_point_dbl_42(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_42(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_lower_42(y, y, x, p1024_mod); + sp_1024_mont_sub_42(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_42(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -45305,7 +45344,7 @@ static int sp_1024_proj_point_dbl_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_lower_42(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_42(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -45382,12 +45421,12 @@ static int sp_1024_iszero_42(const sp_digit* a) static void sp_1024_proj_point_add_42(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*42; - sp_digit* t3 = t + 4*42; - sp_digit* t4 = t + 6*42; - sp_digit* t5 = t + 8*42; - sp_digit* t6 = t + 10*42; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*42; + sp_digit* t2 = t + 4*42; + sp_digit* t3 = t + 6*42; + sp_digit* t4 = t + 8*42; + sp_digit* t5 = t + 10*42; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_42(t1, q->z, p1024_mod, p1024_mp_mod); @@ -45409,17 +45448,9 @@ static void sp_1024_proj_point_add_42(sp_point_1024* r, sp_1024_proj_point_dbl_42(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_1024_mont_sub_42(t2, t2, t1, p1024_mod); @@ -45438,20 +45469,31 @@ static void sp_1024_proj_point_add_42(sp_point_1024* r, sp_1024_mont_dbl_42(t3, y, p1024_mod); sp_1024_mont_sub_42(x, x, t3, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_42(y, y, x, p1024_mod); + sp_1024_mont_sub_42(y, y, x, 
p1024_mod); sp_1024_mont_mul_42(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(y, y, t5, p1024_mod); - for (i = 0; i < 42; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 42; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 42; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 42; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -45497,12 +45539,12 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*42; - ctx->t3 = t + 4*42; - ctx->t4 = t + 6*42; - ctx->t5 = t + 8*42; - ctx->t6 = t + 10*42; + ctx->t6 = t; + ctx->t1 = t + 2*42; + ctx->t2 = t + 4*42; + ctx->t3 = t + 6*42; + ctx->t4 = t + 8*42; + ctx->t5 = t + 10*42; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -45609,7 +45651,7 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_42(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_42(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 22; break; case 22: @@ -45622,22 +45664,28 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 42; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 42; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 42; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 42; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -46000,8 +46048,6 @@ static void sp_1024_cond_copy_42(sp_digit* r, const sp_digit* a, const sp_digit #endif /* WOLFSSL_SP_SMALL */ } -#define sp_1024_mont_dbl_lower_42 sp_1024_mont_dbl_42 -#define sp_1024_mont_tpl_lower_42 sp_1024_mont_tpl_42 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
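The sp_*_from_mp hunks replace the per-iteration `a->used - i` comparisons with a negated counter `j` that is simply incremented once per output word; both the old and new loops copy the first a->used digits and zero the remaining words without branching on the length. A simplified sketch of that counter-and-mask idea — the 64-bit word type, the top-bit shift and the names are illustrative assumptions, not the exact expressions in the patch:

#include <stdint.h>

/* Copy the first 'used' words of src into dst[0..size-1] and zero the tail,
 * with no data-dependent branch on 'used'. Assumes 1 <= used <= size. */
static void copy_words_ct(uint64_t* dst, int size,
                          const uint64_t* src, int used)
{
    uint64_t j = (uint64_t)0 - (uint64_t)used; /* top bit set while words remain */
    int o = 0;                                 /* source index, stops at used-1  */
    int i;

    for (i = 0; i < size; i++) {
        uint64_t mask = (uint64_t)0 - (j >> 63); /* all ones or zero */
        dst[i] = src[o] & mask;
        j++;
        o += (int)(j >> 63); /* advance only while source words remain */
    }
}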
@@ -46040,7 +46086,7 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_42(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_42(a, t1, p1024_mod); + sp_1024_mont_tpl_42(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_42(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(b, t1, x, p1024_mod, p1024_mp_mod); @@ -46049,8 +46095,8 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int i, sp_1024_mont_dbl_42(t2, b, p1024_mod); sp_1024_mont_sub_42(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_42(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_42(b, t2, p1024_mod); + sp_1024_mont_sub_42(t2, b, x, p1024_mod); + sp_1024_mont_dbl_42(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_42(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -46070,7 +46116,7 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_42(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_42(a, t1, p1024_mod); + sp_1024_mont_tpl_42(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_42(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(b, t1, x, p1024_mod, p1024_mp_mod); @@ -46079,8 +46125,8 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int i, sp_1024_mont_dbl_42(t2, b, p1024_mod); sp_1024_mont_sub_42(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_42(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_42(b, t2, p1024_mod); + sp_1024_mont_sub_42(t2, b, x, p1024_mod); + sp_1024_mont_dbl_42(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_42(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -46136,7 +46182,7 @@ static void sp_1024_proj_point_dbl_n_store_42(sp_point_1024* r, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_42(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_42(a, t1, p1024_mod); + sp_1024_mont_tpl_42(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_42(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_42(b, t1, x, p1024_mod, p1024_mp_mod); @@ -46146,8 +46192,8 @@ static void sp_1024_proj_point_dbl_n_store_42(sp_point_1024* r, sp_1024_mont_dbl_42(t2, b, p1024_mod); sp_1024_mont_sub_42(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_42(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_42(b, t2, p1024_mod); + sp_1024_mont_sub_42(t2, b, x, p1024_mod); + sp_1024_mont_dbl_42(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_42(r[j].z, z, y, p1024_mod, p1024_mp_mod); z = r[j].z; @@ -46235,8 +46281,8 @@ static void sp_1024_proj_point_add_sub_42(sp_point_1024* ra, sp_1024_mont_sub_42(xs, xs, t1, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_1024_mont_sub_lower_42(ys, ya, xs, p1024_mod); - sp_1024_mont_sub_lower_42(ya, ya, xa, p1024_mod); + sp_1024_mont_sub_42(ys, ya, xs, p1024_mod); + sp_1024_mont_sub_42(ya, ya, xa, p1024_mod); sp_1024_mont_mul_42(ya, ya, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_42(t6, p1024_mod, t6, p1024_mod); sp_1024_mont_mul_42(ys, ys, t6, p1024_mod, p1024_mp_mod); @@ -46364,7 +46410,7 @@ static int sp_1024_ecc_mulmod_win_add_sub_42(sp_point_1024* r, const sp_point_10 (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * + t = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * (65+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -46487,12 +46533,12 @@ static int 
sp_1024_ecc_mulmod_win_add_sub_42(sp_point_1024* r, const sp_point_10 static void sp_1024_proj_point_add_qz1_42(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*42; - sp_digit* t3 = t + 4*42; - sp_digit* t4 = t + 6*42; - sp_digit* t5 = t + 8*42; - sp_digit* t6 = t + 10*42; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*42; + sp_digit* t6 = t + 4*42; + sp_digit* t1 = t + 6*42; + sp_digit* t4 = t + 8*42; + sp_digit* t5 = t + 10*42; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -46508,13 +46554,9 @@ static void sp_1024_proj_point_add_qz1_42(sp_point_1024* r, sp_1024_proj_point_dbl_42(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_1024_mont_sub_42(t2, t2, p->x, p1024_mod); @@ -46523,33 +46565,40 @@ static void sp_1024_proj_point_add_qz1_42(sp_point_1024* r, /* Z3 = H*Z1 */ sp_1024_mont_mul_42(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_1024_mont_sqr_42(t1, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_sqr_42(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(t3, p->x, t5, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_42(x, t1, t5, p1024_mod); - sp_1024_mont_dbl_42(t1, t3, p1024_mod); - sp_1024_mont_sub_42(x, x, t1, p1024_mod); + sp_1024_mont_sqr_42(t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t3, p->x, t1, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_42(t1, t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_42(t2, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_42(t2, t2, t1, p1024_mod); + sp_1024_mont_dbl_42(t5, t3, p1024_mod); + sp_1024_mont_sub_42(x, t2, t5, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_lower_42(t3, t3, x, p1024_mod); + sp_1024_mont_sub_42(t3, t3, x, p1024_mod); sp_1024_mont_mul_42(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_42(t5, t5, p->y, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_42(y, t3, t5, p1024_mod); + sp_1024_mont_mul_42(t1, t1, p->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_42(y, t3, t1, p1024_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 42; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 42; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 42; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 42; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 42; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -50956,7 +51005,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, + point = 
(sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index 060348b64..d9a55dbe6 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -56,6 +56,15 @@ #include +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif + #ifndef WOLFSSL_SP_ASM #if SP_WORD_SIZE == 64 #define SP_PRINT_NUM(var, name, total, words, bits) \ @@ -140,14 +149,14 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 61 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 60); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 60); } #elif DIGIT_BIT > 61 unsigned int i; @@ -3419,14 +3428,14 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 57 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 56); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 56); } #elif DIGIT_BIT > 57 unsigned int i; @@ -7092,14 +7101,14 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 60 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 59); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 59); } #elif DIGIT_BIT > 60 unsigned int i; @@ -10302,14 +10311,14 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 57 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 56); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 56); } #elif DIGIT_BIT > 57 unsigned int i; @@ -14160,14 +14169,14 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 59 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 58); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 58); } #elif DIGIT_BIT > 59 unsigned int i; @@ -17229,14 +17238,14 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 53 int i; - int j = 0; + 
sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 52); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 52); } #elif DIGIT_BIT > 53 unsigned int i; @@ -21481,14 +21490,14 @@ static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 52 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 51); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 51); } #elif DIGIT_BIT > 52 unsigned int i; @@ -22193,7 +22202,6 @@ static void sp_256_mont_sub_5(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_256_norm_5(r); } -#define sp_256_mont_sub_lower_5 sp_256_mont_sub_5 /* Shift number left one bit. * Bottom bit is lost. * @@ -22284,7 +22292,7 @@ static void sp_256_proj_point_dbl_5(sp_point_256* r, const sp_point_256* p, /* X = X - Y */ sp_256_mont_sub_5(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_lower_5(y, y, x, p256_mod); + sp_256_mont_sub_5(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_5(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -22406,7 +22414,7 @@ static int sp_256_proj_point_dbl_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_lower_5(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_5(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -22466,12 +22474,12 @@ static int sp_256_iszero_5(const sp_digit* a) static void sp_256_proj_point_add_5(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*5; - sp_digit* t3 = t + 4*5; - sp_digit* t4 = t + 6*5; - sp_digit* t5 = t + 8*5; - sp_digit* t6 = t + 10*5; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*5; + sp_digit* t2 = t + 4*5; + sp_digit* t3 = t + 6*5; + sp_digit* t4 = t + 8*5; + sp_digit* t5 = t + 10*5; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_5(t1, q->z, p256_mod, p256_mp_mod); @@ -22493,17 +22501,9 @@ static void sp_256_proj_point_add_5(sp_point_256* r, sp_256_proj_point_dbl_5(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_256_mont_sub_5(t2, t2, t1, p256_mod); @@ -22522,20 +22522,31 @@ static void sp_256_proj_point_add_5(sp_point_256* r, sp_256_mont_dbl_5(t3, y, p256_mod); sp_256_mont_sub_5(x, x, t3, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_5(y, y, x, p256_mod); + sp_256_mont_sub_5(y, y, x, p256_mod); sp_256_mont_mul_5(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_5(y, y, t5, p256_mod); - for (i = 0; i < 5; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 5; i++) { + r->x[i] = 
(p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 5; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 5; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -22581,12 +22592,12 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*5; - ctx->t3 = t + 4*5; - ctx->t4 = t + 6*5; - ctx->t5 = t + 8*5; - ctx->t6 = t + 10*5; + ctx->t6 = t; + ctx->t1 = t + 2*5; + ctx->t2 = t + 4*5; + ctx->t3 = t + 6*5; + ctx->t4 = t + 8*5; + ctx->t5 = t + 10*5; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -22693,7 +22704,7 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_5(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_5(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 22; break; case 22: @@ -22706,22 +22717,28 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 5; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 5; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 5; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 5; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -23119,8 +23136,6 @@ static void sp_256_cond_copy_5(sp_digit* r, const sp_digit* a, const sp_digit m) #endif /* WOLFSSL_SP_SMALL */ } -#define sp_256_mont_dbl_lower_5 sp_256_mont_dbl_5 -#define sp_256_mont_tpl_lower_5 sp_256_mont_tpl_5 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
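The *_nb functions touched here (sp_256_proj_point_add_5_nb and the other curves' equivalents) keep their progress in a context whose `state` field selects the next arithmetic step, so one call performs one step and the caller resumes the operation later. A generic sketch of that resumable pattern — the type, constants and step bodies are hypothetical, not the wolfSSL API:

enum { STEP_INIT = 0, STEP_SQUARE, STEP_ADD, STEP_DONE };

typedef struct {
    int  state;  /* which step runs on the next call       */
    long acc;    /* intermediate data carried across calls */
} nb_ctx;

#define NB_IN_PROGRESS 1
#define NB_DONE        0

/* Perform one step of a long operation; call until it returns NB_DONE. */
static int long_op_nb(nb_ctx* ctx, long input, long* out)
{
    int ret = NB_IN_PROGRESS;

    switch (ctx->state) {
    case STEP_INIT:
        ctx->acc = input;
        ctx->state = STEP_SQUARE;
        break;
    case STEP_SQUARE:
        ctx->acc *= ctx->acc;        /* first chunk of work  */
        ctx->state = STEP_ADD;
        break;
    case STEP_ADD:
        ctx->acc += input;           /* second chunk of work */
        ctx->state = STEP_DONE;
        break;
    case STEP_DONE:
        *out = ctx->acc;
        ret = NB_DONE;
        break;
    }
    return ret;
}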
@@ -23159,7 +23174,7 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_5(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_5(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_5(a, t1, p256_mod); + sp_256_mont_tpl_5(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_5(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_5(b, t1, x, p256_mod, p256_mp_mod); @@ -23168,8 +23183,8 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int i, sp_256_mont_dbl_5(t2, b, p256_mod); sp_256_mont_sub_5(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_5(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_5(b, t2, p256_mod); + sp_256_mont_sub_5(t2, b, x, p256_mod); + sp_256_mont_dbl_5(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_5(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -23189,7 +23204,7 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_5(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_5(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_5(a, t1, p256_mod); + sp_256_mont_tpl_5(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_5(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_5(b, t1, x, p256_mod, p256_mp_mod); @@ -23198,8 +23213,8 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int i, sp_256_mont_dbl_5(t2, b, p256_mod); sp_256_mont_sub_5(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_5(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_5(b, t2, p256_mod); + sp_256_mont_sub_5(t2, b, x, p256_mod); + sp_256_mont_dbl_5(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_5(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -23255,7 +23270,7 @@ static void sp_256_proj_point_dbl_n_store_5(sp_point_256* r, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_5(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_5(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_5(a, t1, p256_mod); + sp_256_mont_tpl_5(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_5(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_5(b, t1, x, p256_mod, p256_mp_mod); @@ -23265,8 +23280,8 @@ static void sp_256_proj_point_dbl_n_store_5(sp_point_256* r, sp_256_mont_dbl_5(t2, b, p256_mod); sp_256_mont_sub_5(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_5(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_5(b, t2, p256_mod); + sp_256_mont_sub_5(t2, b, x, p256_mod); + sp_256_mont_dbl_5(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_5(r[j].z, z, y, p256_mod, p256_mp_mod); z = r[j].z; @@ -23354,8 +23369,8 @@ static void sp_256_proj_point_add_sub_5(sp_point_256* ra, sp_256_mont_sub_5(xs, xs, t1, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_256_mont_sub_lower_5(ys, ya, xs, p256_mod); - sp_256_mont_sub_lower_5(ya, ya, xa, p256_mod); + sp_256_mont_sub_5(ys, ya, xs, p256_mod); + sp_256_mont_sub_5(ya, ya, xa, p256_mod); sp_256_mont_mul_5(ya, ya, t4, p256_mod, p256_mp_mod); sp_256_sub_5(t6, p256_mod, t6); sp_256_mont_mul_5(ys, ys, t6, p256_mod, p256_mp_mod); @@ -23523,7 +23538,7 @@ static int sp_256_ecc_mulmod_win_add_sub_5(sp_point_256* r, const sp_point_256* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * + t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -23642,12 +23657,12 @@ static int sp_256_ecc_mulmod_win_add_sub_5(sp_point_256* r, const sp_point_256* static void sp_256_proj_point_add_qz1_5(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 
= t; - sp_digit* t2 = t + 2*5; - sp_digit* t3 = t + 4*5; - sp_digit* t4 = t + 6*5; - sp_digit* t5 = t + 8*5; - sp_digit* t6 = t + 10*5; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*5; + sp_digit* t6 = t + 4*5; + sp_digit* t1 = t + 6*5; + sp_digit* t4 = t + 8*5; + sp_digit* t5 = t + 10*5; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -23663,13 +23678,9 @@ static void sp_256_proj_point_add_qz1_5(sp_point_256* r, sp_256_proj_point_dbl_5(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_256_mont_sub_5(t2, t2, p->x, p256_mod); @@ -23678,33 +23689,40 @@ static void sp_256_proj_point_add_qz1_5(sp_point_256* r, /* Z3 = H*Z1 */ sp_256_mont_mul_5(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_256_mont_sqr_5(t1, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_5(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(t3, p->x, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_5(x, t1, t5, p256_mod); - sp_256_mont_dbl_5(t1, t3, p256_mod); - sp_256_mont_sub_5(x, x, t1, p256_mod); + sp_256_mont_sqr_5(t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t3, p->x, t1, p256_mod, p256_mp_mod); + sp_256_mont_mul_5(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_5(t2, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_5(t2, t2, t1, p256_mod); + sp_256_mont_dbl_5(t5, t3, p256_mod); + sp_256_mont_sub_5(x, t2, t5, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_lower_5(t3, t3, x, p256_mod); + sp_256_mont_sub_5(t3, t3, x, p256_mod); sp_256_mont_mul_5(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_5(t5, t5, p->y, p256_mod, p256_mp_mod); - sp_256_mont_sub_5(y, t3, t5, p256_mod); + sp_256_mont_mul_5(t1, t1, p->y, p256_mod, p256_mp_mod); + sp_256_mont_sub_5(y, t3, t1, p256_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 5; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 5; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 5; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 5; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 5; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -24209,7 +24227,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_256* point = NULL; + sp_point_256* point = NULL; sp_digit* k = NULL; #else sp_point_256 point[2]; @@ -25688,7 +25706,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, + point = 
(sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -25842,7 +25860,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_256* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -25850,7 +25868,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -26471,7 +26489,7 @@ static void sp_256_mont_inv_order_5(sp_digit* r, const sp_digit* a, sp_256_mont_sqr_n_order_5(t2, t3, 4); /* t = a^ff = t2 * t3 */ sp_256_mont_mul_order_5(t, t2, t3); - /* t3= a^ff00 = t ^ 2 ^ 8 */ + /* t2= a^ff00 = t ^ 2 ^ 8 */ sp_256_mont_sqr_n_order_5(t2, t, 8); /* t = a^ffff = t2 * t */ sp_256_mont_mul_order_5(t, t2, t); @@ -26488,7 +26506,11 @@ static void sp_256_mont_inv_order_5(sp_digit* r, const sp_digit* a, /* t2= a^ffffffff00000000ffffffffffffffff = t2 * t */ sp_256_mont_mul_order_5(t2, t2, t); /* t2= a^ffffffff00000000ffffffffffffffffbce6 */ - for (i=127; i>=112; i--) { + sp_256_mont_sqr_order_5(t2, t2); + sp_256_mont_mul_order_5(t2, t2, a); + sp_256_mont_sqr_n_order_5(t2, t2, 5); + sp_256_mont_mul_order_5(t2, t2, t3); + for (i=121; i>=112; i--) { sp_256_mont_sqr_order_5(t2, t2); if ((p256_order_low[i / 64] & ((sp_int_digit)1 << (i % 64))) != 0) { sp_256_mont_mul_order_5(t2, t2, a); @@ -28347,14 +28369,14 @@ static void sp_384_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 55 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 54); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 54); } #elif DIGIT_BIT > 55 unsigned int i; @@ -29098,7 +29120,6 @@ static void sp_384_mont_sub_7(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_384_norm_7(r); } -#define sp_384_mont_sub_lower_7 sp_384_mont_sub_7 /* Shift number left one bit. * Bottom bit is lost. 
* @@ -29191,7 +29212,7 @@ static void sp_384_proj_point_dbl_7(sp_point_384* r, const sp_point_384* p, /* X = X - Y */ sp_384_mont_sub_7(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_lower_7(y, y, x, p384_mod); + sp_384_mont_sub_7(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_7(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -29313,7 +29334,7 @@ static int sp_384_proj_point_dbl_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con break; case 16: /* Y = Y - X */ - sp_384_mont_sub_lower_7(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_7(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -29374,12 +29395,12 @@ static int sp_384_iszero_7(const sp_digit* a) static void sp_384_proj_point_add_7(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*7; - sp_digit* t3 = t + 4*7; - sp_digit* t4 = t + 6*7; - sp_digit* t5 = t + 8*7; - sp_digit* t6 = t + 10*7; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*7; + sp_digit* t2 = t + 4*7; + sp_digit* t3 = t + 6*7; + sp_digit* t4 = t + 8*7; + sp_digit* t5 = t + 10*7; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_7(t1, q->z, p384_mod, p384_mp_mod); @@ -29401,17 +29422,9 @@ static void sp_384_proj_point_add_7(sp_point_384* r, sp_384_proj_point_dbl_7(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_384_mont_sub_7(t2, t2, t1, p384_mod); @@ -29430,20 +29443,31 @@ static void sp_384_proj_point_add_7(sp_point_384* r, sp_384_mont_dbl_7(t3, y, p384_mod); sp_384_mont_sub_7(x, x, t3, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_7(y, y, x, p384_mod); + sp_384_mont_sub_7(y, y, x, p384_mod); sp_384_mont_mul_7(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_7(y, y, t5, p384_mod); - for (i = 0; i < 7; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 7; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 7; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 7; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -29489,12 +29513,12 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*7; - ctx->t3 = t + 4*7; - ctx->t4 = t + 6*7; - ctx->t5 = t + 8*7; - ctx->t6 = t + 10*7; + ctx->t6 = t; + ctx->t1 = t + 2*7; + ctx->t2 = t + 4*7; + ctx->t3 = t + 6*7; + ctx->t4 = t + 8*7; + ctx->t5 = t + 10*7; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -29601,7 +29625,7 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - 
sp_384_mont_sub_lower_7(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_7(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 22; break; case 22: @@ -29614,22 +29638,28 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 7; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 7; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 7; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 7; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -30063,8 +30093,6 @@ static void sp_384_cond_copy_7(sp_digit* r, const sp_digit* a, const sp_digit m) #endif /* WOLFSSL_SP_SMALL */ } -#define sp_384_mont_dbl_lower_7 sp_384_mont_dbl_7 -#define sp_384_mont_tpl_lower_7 sp_384_mont_tpl_7 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
@@ -30103,7 +30131,7 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_7(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_7(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_7(a, t1, p384_mod); + sp_384_mont_tpl_7(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_7(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_7(b, t1, x, p384_mod, p384_mp_mod); @@ -30112,8 +30140,8 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int i, sp_384_mont_dbl_7(t2, b, p384_mod); sp_384_mont_sub_7(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_7(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_7(b, t2, p384_mod); + sp_384_mont_sub_7(t2, b, x, p384_mod); + sp_384_mont_dbl_7(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_7(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -30133,7 +30161,7 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_7(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_7(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_7(a, t1, p384_mod); + sp_384_mont_tpl_7(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_7(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_7(b, t1, x, p384_mod, p384_mp_mod); @@ -30142,8 +30170,8 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int i, sp_384_mont_dbl_7(t2, b, p384_mod); sp_384_mont_sub_7(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_7(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_7(b, t2, p384_mod); + sp_384_mont_sub_7(t2, b, x, p384_mod); + sp_384_mont_dbl_7(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_7(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -30199,7 +30227,7 @@ static void sp_384_proj_point_dbl_n_store_7(sp_point_384* r, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_7(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_7(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_7(a, t1, p384_mod); + sp_384_mont_tpl_7(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_7(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_7(b, t1, x, p384_mod, p384_mp_mod); @@ -30209,8 +30237,8 @@ static void sp_384_proj_point_dbl_n_store_7(sp_point_384* r, sp_384_mont_dbl_7(t2, b, p384_mod); sp_384_mont_sub_7(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_7(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_7(b, t2, p384_mod); + sp_384_mont_sub_7(t2, b, x, p384_mod); + sp_384_mont_dbl_7(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_7(r[j].z, z, y, p384_mod, p384_mp_mod); z = r[j].z; @@ -30298,8 +30326,8 @@ static void sp_384_proj_point_add_sub_7(sp_point_384* ra, sp_384_mont_sub_7(xs, xs, t1, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_384_mont_sub_lower_7(ys, ya, xs, p384_mod); - sp_384_mont_sub_lower_7(ya, ya, xa, p384_mod); + sp_384_mont_sub_7(ys, ya, xs, p384_mod); + sp_384_mont_sub_7(ya, ya, xa, p384_mod); sp_384_mont_mul_7(ya, ya, t4, p384_mod, p384_mp_mod); sp_384_sub_7(t6, p384_mod, t6); sp_384_mont_mul_7(ys, ys, t6, p384_mod, p384_mp_mod); @@ -30479,7 +30507,7 @@ static int sp_384_ecc_mulmod_win_add_sub_7(sp_point_384* r, const sp_point_384* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * + t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -30598,12 +30626,12 @@ static int sp_384_ecc_mulmod_win_add_sub_7(sp_point_384* r, const sp_point_384* static void sp_384_proj_point_add_qz1_7(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 
= t; - sp_digit* t2 = t + 2*7; - sp_digit* t3 = t + 4*7; - sp_digit* t4 = t + 6*7; - sp_digit* t5 = t + 8*7; - sp_digit* t6 = t + 10*7; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*7; + sp_digit* t6 = t + 4*7; + sp_digit* t1 = t + 6*7; + sp_digit* t4 = t + 8*7; + sp_digit* t5 = t + 10*7; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -30619,13 +30647,9 @@ static void sp_384_proj_point_add_qz1_7(sp_point_384* r, sp_384_proj_point_dbl_7(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_384_mont_sub_7(t2, t2, p->x, p384_mod); @@ -30634,33 +30658,40 @@ static void sp_384_proj_point_add_qz1_7(sp_point_384* r, /* Z3 = H*Z1 */ sp_384_mont_mul_7(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_384_mont_sqr_7(t1, t4, p384_mod, p384_mp_mod); - sp_384_mont_sqr_7(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(t3, p->x, t5, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_7(x, t1, t5, p384_mod); - sp_384_mont_dbl_7(t1, t3, p384_mod); - sp_384_mont_sub_7(x, x, t1, p384_mod); + sp_384_mont_sqr_7(t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t3, p->x, t1, p384_mod, p384_mp_mod); + sp_384_mont_mul_7(t1, t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_7(t2, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_7(t2, t2, t1, p384_mod); + sp_384_mont_dbl_7(t5, t3, p384_mod); + sp_384_mont_sub_7(x, t2, t5, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_lower_7(t3, t3, x, p384_mod); + sp_384_mont_sub_7(t3, t3, x, p384_mod); sp_384_mont_mul_7(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_7(t5, t5, p->y, p384_mod, p384_mp_mod); - sp_384_mont_sub_7(y, t3, t5, p384_mod); + sp_384_mont_mul_7(t1, t1, p->y, p384_mod, p384_mp_mod); + sp_384_mont_sub_7(y, t3, t1, p384_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 7; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 7; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 7; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 7; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 7; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -31173,7 +31204,7 @@ int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_384* point = NULL; + sp_point_384* point = NULL; sp_digit* k = NULL; #else sp_point_384 point[2]; @@ -33162,7 +33193,7 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, + point = 
(sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -33316,7 +33347,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_384* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -33324,7 +33355,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -35910,14 +35941,14 @@ static void sp_521_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 58 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 57); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 57); } #elif DIGIT_BIT > 58 unsigned int i; @@ -36622,7 +36653,6 @@ static void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_521_norm_9(r); } -#define sp_521_mont_sub_lower_9 sp_521_mont_sub_9 /* Shift number left one bit. * Bottom bit is lost. * @@ -36717,7 +36747,7 @@ static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, /* X = X - Y */ sp_521_mont_sub_9(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_lower_9(y, y, x, p521_mod); + sp_521_mont_sub_9(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -36839,7 +36869,7 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con break; case 16: /* Y = Y - X */ - sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -36901,12 +36931,12 @@ static int sp_521_iszero_9(const sp_digit* a) static void sp_521_proj_point_add_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*9; - sp_digit* t3 = t + 4*9; - sp_digit* t4 = t + 6*9; - sp_digit* t5 = t + 8*9; - sp_digit* t6 = t + 10*9; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*9; + sp_digit* t2 = t + 4*9; + sp_digit* t3 = t + 6*9; + sp_digit* t4 = t + 8*9; + sp_digit* t5 = t + 10*9; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod); @@ -36928,17 +36958,9 @@ static void sp_521_proj_point_add_9(sp_point_521* r, sp_521_proj_point_dbl_9(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_521_mont_sub_9(t2, t2, t1, p521_mod); @@ -36957,20 +36979,31 @@ static void sp_521_proj_point_add_9(sp_point_521* r, sp_521_mont_dbl_9(t3, y, p521_mod); sp_521_mont_sub_9(x, x, t3, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_9(y, y, x, p521_mod); + sp_521_mont_sub_9(y, y, x, p521_mod); sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t5, p521_mod); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & 
maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -37016,12 +37049,12 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*9; - ctx->t3 = t + 4*9; - ctx->t4 = t + 6*9; - ctx->t5 = t + 8*9; - ctx->t6 = t + 10*9; + ctx->t6 = t; + ctx->t1 = t + 2*9; + ctx->t2 = t + 4*9; + ctx->t3 = t + 6*9; + ctx->t4 = t + 8*9; + ctx->t5 = t + 10*9; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -37128,7 +37161,7 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 22; break; case 22: @@ -37141,22 +37174,28 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -37471,8 +37510,6 @@ static void sp_521_cond_copy_9(sp_digit* r, const sp_digit* a, const sp_digit m) #endif /* WOLFSSL_SP_SMALL */ } -#define sp_521_mont_dbl_lower_9 sp_521_mont_dbl_9 -#define sp_521_mont_tpl_lower_9 sp_521_mont_tpl_9 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
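The hunks above rework the constant-time infinity handling in the projective point-add routines: the mask computation moves into a local block next to the copy loops, and the combined infinity flag is OR'd into r->z[0] and stored in r->infinity once. As a standalone illustration of that branchless selection, here is a minimal C sketch; the 4-limb size, the sp_digit typedef and the demo values in main() are assumptions for this example only, not wolfSSL's actual definitions.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t sp_digit;   /* assumed 32-bit limb just for this sketch */
#define LIMBS 4              /* illustrative limb count, not a wolfSSL size */

/* Branchless select of one coordinate array: r = p when only q is the point
 * at infinity, r = q when only p is, otherwise the freshly computed value t.
 * This mirrors the maskp/maskq/maskt logic added in the diff. */
static void select_coord(sp_digit* r, const sp_digit* p, int p_inf,
                         const sp_digit* q, int q_inf, const sp_digit* t)
{
    sp_digit maskp = (sp_digit)0 - (sp_digit)(q_inf & (!p_inf));
    sp_digit maskq = (sp_digit)0 - (sp_digit)(p_inf & (!q_inf));
    sp_digit maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < LIMBS; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}

int main(void)
{
    const sp_digit p[LIMBS] = {1, 2, 3, 4};
    const sp_digit q[LIMBS] = {5, 6, 7, 8};
    const sp_digit t[LIMBS] = {9, 10, 11, 12};
    sp_digit r[LIMBS];
    int p_inf = 0;
    int q_inf = 1;
    /* Both-infinite case: the patch additionally ORs (p_inf & q_inf) into
     * r->z[0] and records it in r->infinity; that step is omitted here. */
    sp_digit inf = (sp_digit)(p_inf & q_inf);

    select_coord(r, p, p_inf, q, q_inf, t);
    printf("r[0]=%u (expect 1, q is infinite) inf=%u\n", (unsigned)r[0],
           (unsigned)inf);
    return 0;
}

Keeping the copy loops mask-based rather than branching on the infinity flags keeps the instruction stream and memory access pattern independent of whether either input is the point at infinity.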
@@ -37511,7 +37548,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_9(a, t1, p521_mod); + sp_521_mont_tpl_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -37520,8 +37557,8 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_9(b, t2, p521_mod); + sp_521_mont_sub_9(t2, b, x, p521_mod); + sp_521_mont_dbl_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -37541,7 +37578,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_9(a, t1, p521_mod); + sp_521_mont_tpl_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -37550,8 +37587,8 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_9(b, t2, p521_mod); + sp_521_mont_sub_9(t2, b, x, p521_mod); + sp_521_mont_dbl_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -37607,7 +37644,7 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_9(a, t1, p521_mod); + sp_521_mont_tpl_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -37617,8 +37654,8 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r, sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_9(b, t2, p521_mod); + sp_521_mont_sub_9(t2, b, x, p521_mod); + sp_521_mont_dbl_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(r[j].z, z, y, p521_mod, p521_mp_mod); z = r[j].z; @@ -37706,8 +37743,8 @@ static void sp_521_proj_point_add_sub_9(sp_point_521* ra, sp_521_mont_sub_9(xs, xs, t1, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_521_mont_sub_lower_9(ys, ya, xs, p521_mod); - sp_521_mont_sub_lower_9(ya, ya, xa, p521_mod); + sp_521_mont_sub_9(ys, ya, xs, p521_mod); + sp_521_mont_sub_9(ya, ya, xa, p521_mod); sp_521_mont_mul_9(ya, ya, t4, p521_mod, p521_mp_mod); sp_521_sub_9(t6, p521_mod, t6); sp_521_mont_mul_9(ys, ys, t6, p521_mod, p521_mp_mod); @@ -37899,7 +37936,7 @@ static int sp_521_ecc_mulmod_win_add_sub_9(sp_point_521* r, const sp_point_521* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * + t = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -38018,12 +38055,12 @@ static int sp_521_ecc_mulmod_win_add_sub_9(sp_point_521* r, const sp_point_521* static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 
= t; - sp_digit* t2 = t + 2*9; - sp_digit* t3 = t + 4*9; - sp_digit* t4 = t + 6*9; - sp_digit* t5 = t + 8*9; - sp_digit* t6 = t + 10*9; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*9; + sp_digit* t6 = t + 4*9; + sp_digit* t1 = t + 6*9; + sp_digit* t4 = t + 8*9; + sp_digit* t5 = t + 10*9; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -38039,13 +38076,9 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, sp_521_proj_point_dbl_9(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_521_mont_sub_9(t2, t2, p->x, p521_mod); @@ -38054,33 +38087,40 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, /* Z3 = H*Z1 */ sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_521_mont_sqr_9(t1, t4, p521_mod, p521_mp_mod); - sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t3, p->x, t5, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(x, t1, t5, p521_mod); - sp_521_mont_dbl_9(t1, t3, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); + sp_521_mont_sqr_9(t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, p->x, t1, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t1, t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_9(t2, t2, t1, p521_mod); + sp_521_mont_dbl_9(t5, t3, p521_mod); + sp_521_mont_sub_9(x, t2, t5, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_lower_9(t3, t3, x, p521_mod); + sp_521_mont_sub_9(t3, t3, x, p521_mod); sp_521_mont_mul_9(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t5, t5, p->y, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(y, t3, t5, p521_mod); + sp_521_mont_mul_9(t1, t1, p->y, p521_mod, p521_mp_mod); + sp_521_mont_sub_9(y, t3, t1, p521_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -38601,7 +38641,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_521* point = NULL; + sp_point_521* point = NULL; sp_digit* k = NULL; #else sp_point_521 point[2]; @@ -40590,7 +40630,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, + point = 
(sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -40745,7 +40785,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_521* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -40753,7 +40793,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -43886,14 +43926,14 @@ static void sp_1024_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 57 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 56); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 56); } #elif DIGIT_BIT > 57 unsigned int i; @@ -44492,7 +44532,6 @@ static void sp_1024_mont_sub_18(sp_digit* r, const sp_digit* a, const sp_digit* sp_1024_norm_18(r); } -#define sp_1024_mont_sub_lower_18 sp_1024_mont_sub_18 /* Shift number left one bit. * Bottom bit is lost. * @@ -44596,7 +44635,7 @@ static void sp_1024_proj_point_dbl_18(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_18(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_lower_18(y, y, x, p1024_mod); + sp_1024_mont_sub_18(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_18(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -44718,7 +44757,7 @@ static int sp_1024_proj_point_dbl_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_lower_18(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_18(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -44784,12 +44823,12 @@ static int sp_1024_iszero_18(const sp_digit* a) static void sp_1024_proj_point_add_18(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*18; - sp_digit* t3 = t + 4*18; - sp_digit* t4 = t + 6*18; - sp_digit* t5 = t + 8*18; - sp_digit* t6 = t + 10*18; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*18; + sp_digit* t2 = t + 4*18; + sp_digit* t3 = t + 6*18; + sp_digit* t4 = t + 8*18; + sp_digit* t5 = t + 10*18; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_18(t1, q->z, p1024_mod, p1024_mp_mod); @@ -44811,17 +44850,9 @@ static void sp_1024_proj_point_add_18(sp_point_1024* r, sp_1024_proj_point_dbl_18(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_1024_mont_sub_18(t2, t2, t1, p1024_mod); @@ -44840,20 +44871,31 @@ static void sp_1024_proj_point_add_18(sp_point_1024* r, sp_1024_mont_dbl_18(t3, y, p1024_mod); sp_1024_mont_sub_18(x, x, t3, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_18(y, y, x, p1024_mod); + sp_1024_mont_sub_18(y, y, x, p1024_mod); sp_1024_mont_mul_18(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(y, y, t5, 
p1024_mod); - for (i = 0; i < 18; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 18; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 18; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 18; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -44899,12 +44941,12 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*18; - ctx->t3 = t + 4*18; - ctx->t4 = t + 6*18; - ctx->t5 = t + 8*18; - ctx->t6 = t + 10*18; + ctx->t6 = t; + ctx->t1 = t + 2*18; + ctx->t2 = t + 4*18; + ctx->t3 = t + 6*18; + ctx->t4 = t + 8*18; + ctx->t5 = t + 10*18; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -45011,7 +45053,7 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_18(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_18(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 22; break; case 22: @@ -45024,22 +45066,28 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 18; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 18; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 18; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 18; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -45354,8 +45402,6 @@ static void sp_1024_cond_copy_18(sp_digit* r, const sp_digit* a, const sp_digit #endif /* WOLFSSL_SP_SMALL */ } -#define sp_1024_mont_dbl_lower_18 sp_1024_mont_dbl_18 -#define sp_1024_mont_tpl_lower_18 sp_1024_mont_tpl_18 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
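The *_mont_sub_lower_*, *_mont_dbl_lower_* and *_mont_tpl_lower_* names removed in these hunks were plain #define aliases of the fully reducing sp_*_mont_sub_* / _dbl_ / _tpl_ helpers, so the call sites now use those helpers directly. For readers unfamiliar with how such a reducing subtraction stays constant time, the sketch below shows the usual masked add-back pattern on generic 32-bit limbs; the function name, the uint64_t carry handling and the toy values are assumptions for illustration and do not reproduce wolfSSL's generated code.

#include <stdint.h>
#include <stdio.h>

/* r = (a - b) mod m for n-limb little-endian numbers with a, b < m.
 * Subtract limb-wise, then add m back under a mask derived from the final
 * borrow, so there is no data-dependent branch. */
static void mod_sub_ct(uint32_t* r, const uint32_t* a, const uint32_t* b,
                       const uint32_t* m, int n)
{
    uint64_t borrow = 0;
    uint64_t carry = 0;
    uint32_t mask;
    int i;

    for (i = 0; i < n; i++) {
        uint64_t t = (uint64_t)a[i] - b[i] - borrow;
        r[i] = (uint32_t)t;
        borrow = (t >> 32) & 1;            /* 1 if this limb underflowed */
    }
    mask = (uint32_t)0 - (uint32_t)borrow; /* all-ones iff a < b */
    for (i = 0; i < n; i++) {
        uint64_t t = (uint64_t)r[i] + (m[i] & mask) + carry;
        r[i] = (uint32_t)t;
        carry = t >> 32;
    }
}

int main(void)
{
    /* toy 2-limb modulus m = 2^33 + 5, a = 3, b = 7 => a - b mod m = 2^33 + 1 */
    const uint32_t m[2] = {5, 2};
    const uint32_t a[2] = {3, 0};
    const uint32_t b[2] = {7, 0};
    uint32_t r[2];

    mod_sub_ct(r, a, b, m, 2);
    printf("r = 0x%08x%08x\n", r[1], r[0]);
    return 0;
}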
@@ -45394,7 +45440,7 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_18(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_18(a, t1, p1024_mod); + sp_1024_mont_tpl_18(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_18(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(b, t1, x, p1024_mod, p1024_mp_mod); @@ -45403,8 +45449,8 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int i, sp_1024_mont_dbl_18(t2, b, p1024_mod); sp_1024_mont_sub_18(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_18(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_18(b, t2, p1024_mod); + sp_1024_mont_sub_18(t2, b, x, p1024_mod); + sp_1024_mont_dbl_18(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_18(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -45424,7 +45470,7 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_18(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_18(a, t1, p1024_mod); + sp_1024_mont_tpl_18(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_18(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(b, t1, x, p1024_mod, p1024_mp_mod); @@ -45433,8 +45479,8 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int i, sp_1024_mont_dbl_18(t2, b, p1024_mod); sp_1024_mont_sub_18(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_18(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_18(b, t2, p1024_mod); + sp_1024_mont_sub_18(t2, b, x, p1024_mod); + sp_1024_mont_dbl_18(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_18(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -45490,7 +45536,7 @@ static void sp_1024_proj_point_dbl_n_store_18(sp_point_1024* r, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_18(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_18(a, t1, p1024_mod); + sp_1024_mont_tpl_18(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_18(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_18(b, t1, x, p1024_mod, p1024_mp_mod); @@ -45500,8 +45546,8 @@ static void sp_1024_proj_point_dbl_n_store_18(sp_point_1024* r, sp_1024_mont_dbl_18(t2, b, p1024_mod); sp_1024_mont_sub_18(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_18(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_18(b, t2, p1024_mod); + sp_1024_mont_sub_18(t2, b, x, p1024_mod); + sp_1024_mont_dbl_18(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_18(r[j].z, z, y, p1024_mod, p1024_mp_mod); z = r[j].z; @@ -45589,8 +45635,8 @@ static void sp_1024_proj_point_add_sub_18(sp_point_1024* ra, sp_1024_mont_sub_18(xs, xs, t1, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_1024_mont_sub_lower_18(ys, ya, xs, p1024_mod); - sp_1024_mont_sub_lower_18(ya, ya, xa, p1024_mod); + sp_1024_mont_sub_18(ys, ya, xs, p1024_mod); + sp_1024_mont_sub_18(ya, ya, xa, p1024_mod); sp_1024_mont_mul_18(ya, ya, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_18(t6, p1024_mod, t6, p1024_mod); sp_1024_mont_mul_18(ys, ys, t6, p1024_mod, p1024_mp_mod); @@ -45718,7 +45764,7 @@ static int sp_1024_ecc_mulmod_win_add_sub_18(sp_point_1024* r, const sp_point_10 (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * + t = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * (65+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -45841,12 +45887,12 @@ static int 
sp_1024_ecc_mulmod_win_add_sub_18(sp_point_1024* r, const sp_point_10 static void sp_1024_proj_point_add_qz1_18(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*18; - sp_digit* t3 = t + 4*18; - sp_digit* t4 = t + 6*18; - sp_digit* t5 = t + 8*18; - sp_digit* t6 = t + 10*18; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*18; + sp_digit* t6 = t + 4*18; + sp_digit* t1 = t + 6*18; + sp_digit* t4 = t + 8*18; + sp_digit* t5 = t + 10*18; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -45862,13 +45908,9 @@ static void sp_1024_proj_point_add_qz1_18(sp_point_1024* r, sp_1024_proj_point_dbl_18(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_1024_mont_sub_18(t2, t2, p->x, p1024_mod); @@ -45877,33 +45919,40 @@ static void sp_1024_proj_point_add_qz1_18(sp_point_1024* r, /* Z3 = H*Z1 */ sp_1024_mont_mul_18(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_1024_mont_sqr_18(t1, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_sqr_18(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(t3, p->x, t5, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_18(x, t1, t5, p1024_mod); - sp_1024_mont_dbl_18(t1, t3, p1024_mod); - sp_1024_mont_sub_18(x, x, t1, p1024_mod); + sp_1024_mont_sqr_18(t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t3, p->x, t1, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_18(t1, t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_18(t2, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_18(t2, t2, t1, p1024_mod); + sp_1024_mont_dbl_18(t5, t3, p1024_mod); + sp_1024_mont_sub_18(x, t2, t5, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_lower_18(t3, t3, x, p1024_mod); + sp_1024_mont_sub_18(t3, t3, x, p1024_mod); sp_1024_mont_mul_18(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_18(t5, t5, p->y, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_18(y, t3, t5, p1024_mod); + sp_1024_mont_mul_18(t1, t1, p->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_18(y, t3, t1, p1024_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 18; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 18; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 18; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 18; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 18; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -49796,7 +49845,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, + point = 
(sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index 51e936dee..bf3f90a77 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -127,14 +127,14 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -239,678 +239,617 @@ static void sp_2048_to_bin_64(sp_digit* r, byte* a) #define sp_2048_norm_64(a) #ifndef WOLFSSL_SP_SMALL +#ifdef WOLFSSL_SP_NO_UMAAL /* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[8]; - sp_digit* tmp = tmp_arr; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( + "SUB sp, sp, #0x24\n\t" + "STR %[r], [sp, #32]\n\t" + "MOV %[r], #0x0\n\t" + "LDR r12, [%[a]]\n\t" /* A[0] * B[0] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r3, r4, r6, r8\n\t" - "mov r5, #0\n\t" - "str r3, [%[tmp], #0]\n\t" - "mov r3, #0\n\t" - /* A[0] * B[1] */ - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r8\n\t" - /* A[1] * B[0] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #4]\n\t" - "mov r4, #0\n\t" + "LDR lr, [%[b]]\n\t" + "UMULL r3, r4, r12, lr\n\t" /* A[0] * B[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * B[1] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * B[0] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[tmp], #8]\n\t" - "mov r5, #0\n\t" - /* A[0] * B[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * B[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * B[1] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[0] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[tmp], #12]\n\t" - "mov r3, #0\n\t" + 
"LDR lr, [%[b], #8]\n\t" + "UMULL r5, r6, r12, lr\n\t" /* A[0] * B[4] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[1] * B[3] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[2] * B[2] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[3] * B[1] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[4] * B[0] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #16]\n\t" - "mov r4, #0\n\t" - /* A[0] * B[5] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * B[4] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * B[3] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * B[2] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[4] * B[1] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * B[0] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[tmp], #20]\n\t" - "mov r5, #0\n\t" + "LDR lr, [%[b], #16]\n\t" + "UMULL r7, r8, r12, lr\n\t" /* A[0] * B[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * B[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * B[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[3] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * B[2] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[1] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[0] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[tmp], #24]\n\t" - "mov r3, #0\n\t" + "LDR lr, [%[b], #24]\n\t" + "UMULL r9, r10, r12, lr\n\t" + "STR r3, [sp]\n\t" + /* A[0] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "MOV r11, %[r]\n\t" 
+ "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[0] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[0] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" /* A[0] * B[7] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" + "LDR lr, [%[b], #28]\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADC r3, %[r], #0x0\n\t" + "UMLAL r10, r3, r12, lr\n\t" + /* A[1] * B[0] */ + "LDR r12, [%[a], #4]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "STR r4, [sp, #4]\n\t" + "ADDS r5, r5, r11\n\t" + /* A[1] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[1] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[1] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[1] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[1] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" /* A[1] * B[6] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[2] * B[5] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[3] * B[4] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[4] * B[3] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[5] * B[2] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[6] * B[1] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[7] * B[0] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #28]\n\t" - "mov r4, #0\n\t" + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" /* A[1] * B[7] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" + "LDR lr, [%[b], #28]\n\t" + "ADC r4, %[r], #0x0\n\t" + "UMLAL r3, r4, r12, lr\n\t" + /* A[2] * B[0] */ + "LDR r12, [%[a], #8]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "STR r5, [sp, #8]\n\t" + "ADDS r6, r6, r11\n\t" + /* A[2] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[2] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + 
"ADDS r8, r8, r11\n\t" + /* A[2] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[2] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[2] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" /* A[2] * B[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * B[5] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[4] * B[4] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * B[3] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[6] * B[2] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[7] * B[1] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[r], #32]\n\t" - "mov r5, #0\n\t" + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" /* A[2] * B[7] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" + "LDR lr, [%[b], #28]\n\t" + "ADC r5, %[r], #0x0\n\t" + "UMLAL r4, r5, r12, lr\n\t" + /* A[3] * B[0] */ + "LDR r12, [%[a], #12]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "STR r6, [sp, #12]\n\t" + "ADDS r7, r7, r11\n\t" + /* A[3] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[3] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[3] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[3] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[3] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" /* A[3] * B[6] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * B[5] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[4] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[3] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[7] * B[2] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, 
r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[r], #36]\n\t" - "mov r3, #0\n\t" + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" /* A[3] * B[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" + "LDR lr, [%[b], #28]\n\t" + "ADC r6, %[r], #0x0\n\t" + "UMLAL r5, r6, r12, lr\n\t" + /* A[4] * B[0] */ + "LDR r12, [%[a], #16]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "STR r7, [sp, #16]\n\t" + "ADDS r8, r8, r11\n\t" + /* A[4] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[4] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[4] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[4] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[4] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" /* A[4] * B[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[5] * B[5] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[6] * B[4] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[7] * B[3] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[r], #40]\n\t" - "mov r4, #0\n\t" + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" /* A[4] * B[7] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" + "LDR lr, [%[b], #28]\n\t" + "ADC r7, %[r], #0x0\n\t" + "UMLAL r6, r7, r12, lr\n\t" + /* A[5] * B[0] */ + "LDR r12, [%[a], #20]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "STR r8, [sp, #20]\n\t" + "ADDS r9, r9, r11\n\t" + /* A[5] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[5] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[5] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[5] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[5] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" /* A[5] * B[6] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[6] * B[5] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds 
r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[7] * B[4] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[r], #44]\n\t" - "mov r5, #0\n\t" + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" /* A[5] * B[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" + "LDR lr, [%[b], #28]\n\t" + "ADC r8, %[r], #0x0\n\t" + "UMLAL r7, r8, r12, lr\n\t" + /* A[6] * B[0] */ + "LDR r12, [%[a], #24]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "STR r9, [sp, #24]\n\t" + "ADDS r10, r10, r11\n\t" + /* A[6] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[6] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[6] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[6] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[6] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" /* A[6] * B[6] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[7] * B[5] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[r], #48]\n\t" - "mov r3, #0\n\t" + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" /* A[6] * B[7] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" + "LDR lr, [%[b], #28]\n\t" + "ADC r9, %[r], #0x0\n\t" + "UMLAL r8, r9, r12, lr\n\t" + /* A[7] * B[0] */ + "LDR r12, [%[a], #28]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "STR r10, [sp, #28]\n\t" + "ADDS r3, r3, r11\n\t" + /* A[7] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[7] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[7] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[7] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[7] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" /* A[7] * B[6] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[r], #52]\n\t" - "mov r4, #0\n\t" + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" /* A[7] * B[7] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, 
r6\n\t" - "adc r3, r3, r8\n\t" - "str r5, [%[r], #56]\n\t" - "str r3, [%[r], #60]\n\t" - /* Transfer tmp to r */ - "ldr r3, [%[tmp], #0]\n\t" - "ldr r4, [%[tmp], #4]\n\t" - "ldr r5, [%[tmp], #8]\n\t" - "ldr r6, [%[tmp], #12]\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "ldr r3, [%[tmp], #16]\n\t" - "ldr r4, [%[tmp], #20]\n\t" - "ldr r5, [%[tmp], #24]\n\t" - "ldr r6, [%[tmp], #28]\n\t" - "str r3, [%[r], #16]\n\t" - "str r4, [%[r], #20]\n\t" - "str r5, [%[r], #24]\n\t" - "str r6, [%[r], #28]\n\t" + "LDR lr, [%[b], #28]\n\t" + "ADC r10, %[r], #0x0\n\t" + "UMLAL r9, r10, r12, lr\n\t" + "LDR %[r], [sp, #32]\n\t" + "ADD %[r], %[r], #0x20\n\t" + "STM %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "SUB %[r], %[r], #0x20\n\t" + "STM %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADD sp, sp, #0x24\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp) - : "memory", "r3", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); } +#else +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static void sp_2048_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x2c\n\t" + "STRD %[r], %[a], [sp, #36]\n\t" + "MOV lr, %[b]\n\t" + "LDM %[a], {%[r], %[a], %[b], r3}\n\t" + "LDM lr!, {r4, r5, r6}\n\t" + "UMULL r10, r11, %[r], r4\n\t" + "UMULL r12, r7, %[a], r4\n\t" + "UMAAL r11, r12, %[r], r5\n\t" + "UMULL r8, r9, %[b], r4\n\t" + "UMAAL r12, r8, %[a], r5\n\t" + "UMAAL r12, r7, %[r], r6\n\t" + "UMAAL r8, r9, r3, r4\n\t" + "STM sp, {r10, r11, r12}\n\t" + "UMAAL r7, r8, %[b], r5\n\t" + "LDM lr!, {r4}\n\t" + "UMULL r10, r11, %[a], r6\n\t" + "UMAAL r8, r9, %[b], r6\n\t" + "UMAAL r7, r10, %[r], r4\n\t" + "UMAAL r8, r11, r3, r5\n\t" + "STR r7, [sp, #12]\n\t" + "UMAAL r8, r10, %[a], r4\n\t" + "UMAAL r9, r11, r3, r6\n\t" + "UMAAL r9, r10, %[b], r4\n\t" + "UMAAL r10, r11, r3, r4\n\t" + "LDM lr, {r4, r5, r6, r7}\n\t" + "MOV r12, #0x0\n\t" + "UMLAL r8, r12, %[r], r4\n\t" + "UMAAL r9, r12, %[a], r4\n\t" + "UMAAL r10, r12, %[b], r4\n\t" + "UMAAL r11, r12, r3, r4\n\t" + "MOV r4, #0x0\n\t" + "UMLAL r9, r4, %[r], r5\n\t" + "UMAAL r10, r4, %[a], r5\n\t" + "UMAAL r11, r4, %[b], r5\n\t" + "UMAAL r12, r4, r3, r5\n\t" + "MOV r5, #0x0\n\t" + "UMLAL r10, r5, %[r], r6\n\t" + "UMAAL r11, r5, %[a], r6\n\t" + "UMAAL r12, r5, %[b], r6\n\t" + "UMAAL r4, r5, r3, r6\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r11, r6, %[r], r7\n\t" + "LDR %[r], [sp, #40]\n\t" + "UMAAL r12, r6, %[a], r7\n\t" + "ADD %[r], %[r], #0x10\n\t" + "UMAAL r4, r6, %[b], r7\n\t" + "SUB lr, lr, #0x10\n\t" + "UMAAL r5, r6, r3, r7\n\t" + "LDM %[r], {%[r], %[a], %[b], r3}\n\t" + "STR r6, [sp, #32]\n\t" + "LDM lr!, {r6}\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r8, r7, %[r], r6\n\t" + "UMAAL r9, r7, %[a], r6\n\t" + "STR r8, [sp, #16]\n\t" + "UMAAL r10, r7, %[b], r6\n\t" + "UMAAL r11, r7, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r8, #0x0\n\t" + "UMLAL r9, r8, %[r], r6\n\t" + "UMAAL r10, r8, %[a], r6\n\t" + "STR r9, [sp, #20]\n\t" + "UMAAL r11, r8, %[b], r6\n\t" + "UMAAL r12, r8, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r9, #0x0\n\t" + 
"UMLAL r10, r9, %[r], r6\n\t" + "UMAAL r11, r9, %[a], r6\n\t" + "STR r10, [sp, #24]\n\t" + "UMAAL r12, r9, %[b], r6\n\t" + "UMAAL r4, r9, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r10, #0x0\n\t" + "UMLAL r11, r10, %[r], r6\n\t" + "UMAAL r12, r10, %[a], r6\n\t" + "STR r11, [sp, #28]\n\t" + "UMAAL r4, r10, %[b], r6\n\t" + "UMAAL r5, r10, r3, r6\n\t" + "LDM lr!, {r11}\n\t" + "UMAAL r12, r7, %[r], r11\n\t" + "UMAAL r4, r7, %[a], r11\n\t" + "LDR r6, [sp, #32]\n\t" + "UMAAL r5, r7, %[b], r11\n\t" + "UMAAL r6, r7, r3, r11\n\t" + "LDM lr!, {r11}\n\t" + "UMAAL r4, r8, %[r], r11\n\t" + "UMAAL r5, r8, %[a], r11\n\t" + "UMAAL r6, r8, %[b], r11\n\t" + "UMAAL r7, r8, r3, r11\n\t" + "LDM lr, {r11, lr}\n\t" + "UMAAL r5, r9, %[r], r11\n\t" + "UMAAL r6, r10, %[r], lr\n\t" + "UMAAL r6, r9, %[a], r11\n\t" + "UMAAL r7, r10, %[a], lr\n\t" + "UMAAL r7, r9, %[b], r11\n\t" + "UMAAL r8, r10, %[b], lr\n\t" + "UMAAL r8, r9, r3, r11\n\t" + "UMAAL r9, r10, r3, lr\n\t" + "MOV r3, r12\n\t" + "LDR lr, [sp, #36]\n\t" + "ADD lr, lr, #0x20\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "SUB lr, lr, #0x20\n\t" + "LDM sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADD sp, sp, #0x2c\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr" + ); +} + +#endif /* WOLFSSL_SP_NO_UMAAL */ +/* Add b to a into r. (r = a + b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static sp_digit sp_2048_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)r; +} + +/* Sub b from a into a. (a -= b) + * + * a A single precision integer and result. + * b A single precision integer. 
+ */ +static sp_digit sp_2048_sub_in_place_16(sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)a; +} + /* Add b to a into r. (r = a + b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_add_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; -} - -/* Sub b from a into r. (r = a - b) - * - * r A single precision integer. - * a A single precision integer. 
- * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_2048_sub_in_place_16(sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6" - ); - - return c; -} - -/* Add b to a into r. (r = a + b) - * - * r A single precision integer. - * a A single precision integer. - * b A single precision integer. - */ -SP_NOINLINE static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) -{ - sp_digit c = 0; - - __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r4", "r5", "r6", "r8" - ); - - return c; + return (uint32_t)(size_t)r; } /* AND m into each word of a and store in r. @@ -978,105 +917,79 @@ SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a, (void)sp_2048_add_8(r + 24, r + 24, a1); } -/* Sub b from a into r. (r = a - b) +/* Sub b from a into a. (a -= b) * - * r A single precision integer. - * a A single precision integer. + * a A single precision integer and result. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM 
%[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); - - return c; + return (uint32_t)(size_t)a; } /* Add b to a into r. (r = a + b) @@ -1085,100 +998,76 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] 
"+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* AND m into each word of a and store in r. @@ -1250,185 +1139,135 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, (void)sp_2048_add_16(r + 48, r + 48, a1); } -/* Sub b from a into r. (r = a - b) +/* Sub b from a into a. (a -= b) * - * r A single precision integer. - * a A single precision integer. + * a A single precision integer and result. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - 
"sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + 
"SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); - - return c; + return (uint32_t)(size_t)a; } /* Add b to a into r. (r = a + b) @@ -1437,180 +1276,132 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - 
"ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, 
r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* AND m into each word of a and store in r. @@ -1682,396 +1473,394 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, (void)sp_2048_add_32(r + 96, r + 96, a1); } +#ifdef WOLFSSL_SP_NO_UMAAL /* Square a and put result in r. (r = a * a) * * r A single precision integer. * a A single precision integer. 
*/ -SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a) +static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) { - sp_digit tmp_arr[8]; - sp_digit* tmp = tmp_arr; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - /* A[0] * A[0] */ - "ldr r6, [%[a], #0]\n\t" - "umull r3, r4, r6, r6\n\t" - "mov r5, #0\n\t" - "str r3, [%[tmp], #0]\n\t" - "mov r3, #0\n\t" + "SUB sp, sp, #0x44\n\t" + "STR %[r], [sp, #64]\n\t" + "MOV %[r], #0x0\n\t" + "LDR r12, [%[a]]\n\t" /* A[0] * A[1] */ - "ldr r8, [%[a], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #4]\n\t" - "mov r4, #0\n\t" - /* A[0] * A[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * A[1] */ - "ldr r6, [%[a], #4]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[tmp], #8]\n\t" - "mov r5, #0\n\t" + "LDR lr, [%[a], #4]\n\t" + "UMULL r4, r5, r12, lr\n\t" /* A[0] * A[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[tmp], #12]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[4] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[3] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[2] */ - "ldr r6, [%[a], #8]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[tmp], #16]\n\t" - "mov r4, #0\n\t" + "LDR lr, [%[a], #12]\n\t" + "UMULL r6, r7, r12, lr\n\t" /* A[0] * A[5] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[4] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[3] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r5, r5, r9\n\t" - "adcs r3, r3, r10\n\t" - "adc r4, r4, r11\n\t" - "str r5, [%[tmp], #20]\n\t" - "mov r5, #0\n\t" - /* A[0] * A[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #16]\n\t" 
- "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[3] */ - "ldr r6, [%[a], #12]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[tmp], #24]\n\t" - "mov r3, #0\n\t" + "LDR lr, [%[a], #20]\n\t" + "UMULL r8, r9, r12, lr\n\t" /* A[0] * A[7] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" + "LDR lr, [%[a], #28]\n\t" + "UMULL r10, r3, r12, lr\n\t" + /* A[0] * A[2] */ + "LDR lr, [%[a], #8]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[0] * A[4] */ + "LDR lr, [%[a], #16]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[0] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + "ADCS r3, r3, #0x0\n\t" + "STR r4, [sp, #4]\n\t" + "STR r5, [sp, #8]\n\t" + /* A[1] * A[2] */ + "LDR r12, [%[a], #4]\n\t" + "LDR lr, [%[a], #8]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "STR r6, [sp, #12]\n\t" + "ADDS r7, r7, r11\n\t" + /* A[1] * A[3] */ + "LDR lr, [%[a], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "STR r7, [sp, #16]\n\t" + "ADDS r8, r8, r11\n\t" + /* A[1] * A[4] */ + "LDR lr, [%[a], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[1] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" /* A[1] * A[6] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[5] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[4] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[tmp], #28]\n\t" - "mov r4, #0\n\t" + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" /* A[1] * A[7] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" + "LDR lr, [%[a], #28]\n\t" + "ADC r4, %[r], #0x0\n\t" + "UMLAL r3, r4, r12, lr\n\t" + /* A[2] * A[3] */ + "LDR r12, [%[a], #8]\n\t" + "LDR lr, [%[a], #12]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "STR r8, [sp, #20]\n\t" + "ADDS r9, r9, r11\n\t" + /* A[2] * A[4] */ + "LDR lr, [%[a], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "STR r9, [sp, #24]\n\t" + "ADDS r10, r10, r11\n\t" + /* A[2] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" /* A[2] * A[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[5] */ - "ldr r6, 
[%[a], #12]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[4] * A[4] */ - "ldr r6, [%[a], #16]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r5, r5, r9\n\t" - "adcs r3, r3, r10\n\t" - "adc r4, r4, r11\n\t" - "str r5, [%[r], #32]\n\t" - "mov r5, #0\n\t" + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" /* A[2] * A[7] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" + "LDR lr, [%[a], #28]\n\t" + "ADC r5, %[r], #0x0\n\t" + "UMLAL r4, r5, r12, lr\n\t" + /* A[3] * A[4] */ + "LDR r12, [%[a], #12]\n\t" + "LDR lr, [%[a], #16]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "STR r10, [sp, #28]\n\t" + "ADDS r3, r3, r11\n\t" + /* A[3] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" /* A[3] * A[6] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[4] * A[5] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[r], #36]\n\t" - "mov r3, #0\n\t" + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" /* A[3] * A[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" + "LDR lr, [%[a], #28]\n\t" + "ADC r6, %[r], #0x0\n\t" + "UMLAL r5, r6, r12, lr\n\t" + /* A[4] * A[5] */ + "LDR r12, [%[a], #16]\n\t" + "LDR lr, [%[a], #20]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" /* A[4] * A[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[5] * A[5] */ - "ldr r6, [%[a], #20]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[r], #40]\n\t" - "mov r4, #0\n\t" + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" /* A[4] * A[7] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" + "LDR lr, [%[a], #28]\n\t" + "ADC r7, %[r], #0x0\n\t" + "UMLAL r6, r7, r12, lr\n\t" /* A[5] * A[6] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[r], #44]\n\t" - "mov r5, #0\n\t" + "LDR r12, [%[a], #20]\n\t" + "LDR lr, [%[a], #24]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" /* 
A[5] * A[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * A[6] */ - "ldr r6, [%[a], #24]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[r], #48]\n\t" - "mov r3, #0\n\t" + "LDR lr, [%[a], #28]\n\t" + "ADC r8, %[r], #0x0\n\t" + "UMLAL r7, r8, r12, lr\n\t" /* A[6] * A[7] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[r], #52]\n\t" - "mov r4, #0\n\t" + "LDR r12, [%[a], #24]\n\t" + "LDR lr, [%[a], #28]\n\t" + "MOV r9, #0x0\n\t" + "UMLAL r8, r9, r12, lr\n\t" + "ADD lr, sp, #0x20\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADD lr, sp, #0x4\n\t" + "LDM lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "STM lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADCS r3, r3, r3\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADC r10, %[r], #0x0\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADD lr, sp, #0x4\n\t" + "LDM lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "MOV lr, sp\n\t" + /* A[0] * A[0] */ + "LDR r12, [%[a]]\n\t" + "UMULL r3, r11, r12, r12\n\t" + "ADDS r4, r4, r11\n\t" + /* A[1] * A[1] */ + "LDR r12, [%[a], #4]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, r12\n\t" + "ADDS r6, r6, r11\n\t" + /* A[2] * A[2] */ + "LDR r12, [%[a], #8]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, r12\n\t" + "ADDS r8, r8, r11\n\t" + /* A[3] * A[3] */ + "LDR r12, [%[a], #12]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, r12\n\t" + "ADDS r10, r10, r11\n\t" + "STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* A[4] * A[4] */ + "LDR r12, [%[a], #16]\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, r12\n\t" + "ADDS r4, r4, r11\n\t" + /* A[5] * A[5] */ + "LDR r12, [%[a], #20]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, r12\n\t" + "ADDS r6, r6, r11\n\t" + /* A[6] * A[6] */ + "LDR r12, [%[a], #24]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, r12\n\t" + "ADDS r8, r8, r11\n\t" /* A[7] * A[7] */ - "ldr r6, [%[a], #28]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r8\n\t" - "str r5, [%[r], #56]\n\t" - "str r3, [%[r], #60]\n\t" - /* Transfer tmp to r */ - "ldr r3, [%[tmp], #0]\n\t" - "ldr r4, [%[tmp], #4]\n\t" - "ldr r5, [%[tmp], #8]\n\t" - "ldr r6, [%[tmp], #12]\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "ldr r3, [%[tmp], #16]\n\t" - "ldr r4, [%[tmp], #20]\n\t" - "ldr r5, [%[tmp], #24]\n\t" - "ldr r6, [%[tmp], #28]\n\t" - "str r3, [%[r], #16]\n\t" - "str r4, [%[r], #20]\n\t" - "str r5, [%[r], #24]\n\t" - "str r6, [%[r], #28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r10, 
r10, #0x0\n\t" + "UMLAL r9, r10, r12, r12\n\t" + "LDR %[r], [sp, #64]\n\t" + "ADD %[r], %[r], #0x20\n\t" + "STM %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "SUB %[r], %[r], #0x20\n\t" + "STM %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADD sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); } +#else +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +static void sp_2048_sqr_8(sp_digit* r_p, const sp_digit* a_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x20\n\t" + "STR %[r], [sp, #28]\n\t" + "LDM %[a], {%[r], %[a], r2, r3, r4, r5, r6, r7}\n\t" + "UMULL r9, r10, %[r], %[r]\n\t" + "UMULL r11, r12, %[r], %[a]\n\t" + "ADDS r11, r11, r11\n\t" + "MOV lr, #0x0\n\t" + "UMAAL r10, r11, lr, lr\n\t" + "STM sp, {r9, r10}\n\t" + "MOV r8, lr\n\t" + "UMAAL r8, r12, %[r], r2\n\t" + "ADCS r8, r8, r8\n\t" + "UMAAL r8, r11, %[a], %[a]\n\t" + "UMULL r9, r10, %[r], r3\n\t" + "UMAAL r9, r12, %[a], r2\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, lr, lr\n\t" + "STRD r8, r9, [sp, #8]\n\t" + "MOV r9, lr\n\t" + "UMAAL r9, r10, %[r], r4\n\t" + "UMAAL r9, r12, %[a], r3\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, r2, r2\n\t" + "STR r9, [sp, #16]\n\t" + "UMULL r9, r8, %[r], r5\n\t" + "UMAAL r9, r12, %[a], r4\n\t" + "UMAAL r9, r10, r2, r3\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, lr, lr\n\t" + "STR r9, [sp, #20]\n\t" + "MOV r9, lr\n\t" + "UMAAL r9, r8, %[r], r6\n\t" + "UMAAL r9, r12, %[a], r5\n\t" + "UMAAL r9, r10, r2, r4\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, r3, r3\n\t" + "STR r9, [sp, #24]\n\t" + "UMULL %[r], r9, %[r], r7\n\t" + "UMAAL %[r], r8, %[a], r6\n\t" + "UMAAL %[r], r12, r2, r5\n\t" + "UMAAL %[r], r10, r3, r4\n\t" + "ADCS %[r], %[r], %[r]\n\t" + "UMAAL %[r], r11, lr, lr\n\t" + /* R[7] = r0 */ + "UMAAL r9, r8, %[a], r7\n\t" + "UMAAL r9, r10, r2, r6\n\t" + "UMAAL r12, r9, r3, r5\n\t" + "ADCS r12, r12, r12\n\t" + "UMAAL r12, r11, r4, r4\n\t" + /* R[8] = r12 */ + "UMAAL r9, r8, r2, r7\n\t" + "UMAAL r10, r9, r3, r6\n\t" + "MOV r2, lr\n\t" + "UMAAL r10, r2, r4, r5\n\t" + "ADCS r10, r10, r10\n\t" + "UMAAL r11, r10, lr, lr\n\t" + /* R[9] = r11 */ + "UMAAL r2, r8, r3, r7\n\t" + "UMAAL r2, r9, r4, r6\n\t" + "ADCS r3, r2, r2\n\t" + "UMAAL r10, r3, r5, r5\n\t" + /* R[10] = r10 */ + "MOV %[a], lr\n\t" + "UMAAL %[a], r8, r4, r7\n\t" + "UMAAL %[a], r9, r5, r6\n\t" + "ADCS r4, %[a], %[a]\n\t" + "UMAAL r3, r4, lr, lr\n\t" + /* R[11] = r3 */ + "UMAAL r8, r9, r5, r7\n\t" + "ADCS r8, r8, r8\n\t" + "UMAAL r4, r8, r6, r6\n\t" + /* R[12] = r4 */ + "MOV r5, lr\n\t" + "UMAAL r5, r9, r6, r7\n\t" + "ADCS r5, r5, r5\n\t" + "UMAAL r8, r5, lr, lr\n\t" + /* R[13] = r8 */ + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r5, r7, r7\n\t" + "ADCS r7, r5, lr\n\t" + /* R[14] = r9 */ + /* R[15] = r7 */ + "LDR lr, [sp, #28]\n\t" + "ADD lr, lr, #0x1c\n\t" + "STM lr!, {%[r], r12}\n\t" + "STM lr!, {r11}\n\t" + "STM lr!, {r10}\n\t" + "STM lr!, {r3, r4, r8, r9}\n\t" + "STM lr!, {r7}\n\t" + "SUB lr, lr, #0x40\n\t" + "LDM sp, {%[r], %[a], r2, r3, r4, r5, r6}\n\t" + "STM lr, {%[r], %[a], r2, r3, r4, r5, r6}\n\t" + "ADD sp, sp, #0x20\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", 
"r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#endif /* WOLFSSL_SP_NO_UMAAL */ /* Sub b from a into r. (r = a - b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_2048_sub_8(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* Square a and put result in r. (r = a * a) @@ -2116,59 +1905,47 @@ SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_2048_sub_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_sub_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* Square a and put result in r. (r = a * a) @@ -2213,99 +1990,75 @@ SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, 
r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* Square a and put result in r. (r = a * a) @@ -2352,39 +2105,39 @@ SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r8, #0\n\t" - "add r6, r6, #256\n\t" - "sub r8, r8, #1\n\t" - "\n1:\n\t" - "adds %[c], %[c], r8\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r3, #0x0\n\t" + "ADD r12, %[a], #0x100\n\t" + "\n" + "L_sp_2048_add_64_word_%=:\n\t" + "ADDS r3, r3, #0x-1\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "MOV r4, #0x0\n\t" + "ADC r3, r4, #0x0\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_2048_add_64_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_2048_add_64_word_%=\n\t" +#endif + "MOV %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -2394,39 +2147,37 @@ SP_NOINLINE static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_sub_in_place_64(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r8, %[a]\n\t" - "add r8, r8, #256\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #8\n\t" - "add %[b], %[b], #8\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" -#else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r8" - ); + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; - return c; + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "ADD r11, %[a], #0x100\n\t" + "\n" + "L_sp_2048_sub_in_pkace_64_word_%=:\n\t" + "RSBS r10, r10, #0x0\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC r10, r10, r10\n\t" + "CMP %[a], r11\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BNE L_sp_2048_sub_in_pkace_64_word_%=\n\t" +#else + "BNE.N L_sp_2048_sub_in_pkace_64_word_%=\n\t" +#endif + "MOV %[a], r10\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)a; } #endif /* WOLFSSL_SP_SMALL */ @@ -2437,86 +2188,74 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_64(sp_digit* a, * a A single precision integer. * b A single precision integer. 
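 *
 * This version multiplies by product scanning: the outer loop walks the
 * output word offset, the inner loop adds every a[i] * b[k - i] product for
 * that column into a three-word accumulator kept in r6:r7:r8, and the
 * columns are written to a stack buffer that is copied to r at the end.
 * A portable sketch of the structure, for illustration only (uint64_t usage
 * is an assumption, not part of this patch):
 *
 *   sp_digit t[128];
 *   sp_digit l = 0, m = 0, h = 0;
 *   int i, k;
 *   for (k = 0; k < 127; k++) {
 *       for (i = (k < 64) ? 0 : (k - 63); (i <= k) && (i < 64); i++) {
 *           uint64_t p = (uint64_t)a[i] * b[k - i];
 *           uint64_t s = (uint64_t)l + (sp_digit)p;
 *           l = (sp_digit)s;
 *           s = (uint64_t)m + (sp_digit)(p >> 32) + (sp_digit)(s >> 32);
 *           m = (sp_digit)s;
 *           h += (sp_digit)(s >> 32);
 *       }
 *       t[k] = l;  l = m;  m = h;  h = 0;
 *   }
 *   t[127] = l;
 *   XMEMCPY(r, t, sizeof(t));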
*/ -SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[64 * 2]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r10, %[a]\n\t" - "mov r11, %[b]\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, r10\n\t" - "mov r14, r6\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r5, #0\n\t" - "mov r6, #252\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov %[b], r9\n\t" - "sub %[b], %[b], %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add %[b], %[b], r11\n\t" - "\n2:\n\t" - /* Multiply Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [%[b]]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply Done */ - "add %[a], %[a], #4\n\t" - "sub %[b], %[b], #4\n\t" - "cmp %[a], r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, r9\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r12\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #248\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r], r8]\n\t" - "mov %[a], r10\n\t" - "mov %[b], r11\n\t" - : - : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; - XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); + __asm__ __volatile__ ( + "SUB sp, sp, #0x200\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "\n" + "L_sp_2048_mul_64_outer_%=:\n\t" + "SUBS r3, r5, #0xfc\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_2048_mul_64_inner_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x100\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_2048_mul_64_inner_done_%=\n\t" +#else + "BEQ.N L_sp_2048_mul_64_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_2048_mul_64_inner_%=\n\t" +#else + "BLE.N L_sp_2048_mul_64_inner_%=\n\t" +#endif + "\n" + "L_sp_2048_mul_64_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x1f8\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_2048_mul_64_outer_%=\n\t" +#else + "BLE.N 
L_sp_2048_mul_64_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_2048_mul_64_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_2048_mul_64_store_%=\n\t" +#else + "BGT.N L_sp_2048_mul_64_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" + ); } /* Square a and put result in r. (r = a * a) @@ -2524,129 +2263,97 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) +static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #2\n\t" - "lsl r6, r6, #8\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #252\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" + "SUB sp, sp, #0x200\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_2048_sqr_64_outer_%=:\n\t" + "SUBS r3, r5, #0xfc\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_2048_sqr_64_inner_%=:\n\t" + "CMP r4, r3\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 4f\n\t" + "BEQ L_sp_2048_sqr_64_op_sqr_%=\n\t" #else - "beq.n 4f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ + "BEQ.N L_sp_2048_sqr_64_op_sqr_%=\n\t" +#endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[a], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "bal L_sp_2048_sqr_64_op_done_%=\n\t" + "\n" + "L_sp_2048_sqr_64_op_sqr_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "\n" + "L_sp_2048_sqr_64_op_done_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bal 5f\n\t" + "BEQ L_sp_2048_sqr_64_inner_done_%=\n\t" #else - "bal.n 5f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" + "BEQ.N L_sp_2048_sqr_64_inner_done_%=\n\t" +#endif + "CMP r3, r4\n\t" #if 
defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" + "BGT L_sp_2048_sqr_64_inner_done_%=\n\t" #else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "cmp %[a], r2\n\t" + "BGT.N L_sp_2048_sqr_64_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bgt 3f\n\t" + "BLE L_sp_2048_sqr_64_inner_%=\n\t" #else - "bgt.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" + "BLE.N L_sp_2048_sqr_64_inner_%=\n\t" +#endif + "\n" + "L_sp_2048_sqr_64_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x1f8\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" + "BLE L_sp_2048_sqr_64_outer_%=\n\t" #else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #248\n\t" - "cmp r8, r6\n\t" + "BLE.N L_sp_2048_sqr_64_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_2048_sqr_64_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" + "BGT L_sp_2048_sqr_64_store_%=\n\t" #else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #1\n\t" - "lsl r3, r3, #8\n\t" - "add r3, r3, #252\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, #2\n\t" - "lsl r6, r6, #8\n\t" - "add sp, sp, r6\n\t" + "BGT.N L_sp_2048_sqr_64_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -2676,39 +2383,39 @@ static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r8, #0\n\t" - "add r6, r6, #128\n\t" - "sub r8, r8, #1\n\t" - "\n1:\n\t" - "adds %[c], %[c], r8\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r3, #0x0\n\t" + "ADD r12, %[a], #0x80\n\t" + "\n" + "L_sp_2048_add_32_word_%=:\n\t" + "ADDS r3, r3, #0x-1\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "MOV r4, #0x0\n\t" + "ADC r3, r4, #0x0\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_2048_add_32_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_2048_add_32_word_%=\n\t" +#endif + "MOV %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -2718,39 +2425,37 @@ SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r8, %[a]\n\t" - "add r8, r8, #128\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #8\n\t" - "add %[b], %[b], #8\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" -#else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r8" - ); + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; - return c; + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "ADD r11, %[a], #0x80\n\t" + "\n" + "L_sp_2048_sub_in_pkace_32_word_%=:\n\t" + "RSBS r10, r10, #0x0\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC r10, r10, r10\n\t" + "CMP %[a], r11\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BNE L_sp_2048_sub_in_pkace_32_word_%=\n\t" +#else + "BNE.N L_sp_2048_sub_in_pkace_32_word_%=\n\t" +#endif + "MOV %[a], r10\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)a; } #endif /* WOLFSSL_SP_SMALL */ @@ -2761,83 +2466,74 @@ SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[32 * 2]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r10, %[a]\n\t" - "mov r11, %[b]\n\t" - "mov r6, #128\n\t" - "add r6, r6, r10\n\t" - "mov r14, r6\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r5, #0\n\t" - "mov r6, #124\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov %[b], r9\n\t" - "sub %[b], %[b], %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add %[b], %[b], r11\n\t" - "\n2:\n\t" - /* Multiply Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [%[b]]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply Done */ - "add %[a], %[a], #4\n\t" - "sub %[b], %[b], #4\n\t" - "cmp %[a], r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, r9\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r12\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #248\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r], r8]\n\t" - "mov %[a], r10\n\t" - "mov %[b], r11\n\t" - : - : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; - XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); + __asm__ __volatile__ ( + "SUB sp, sp, #0x100\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "\n" + "L_sp_2048_mul_32_outer_%=:\n\t" + "SUBS r3, r5, #0x7c\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_2048_mul_32_inner_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x80\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_2048_mul_32_inner_done_%=\n\t" +#else + "BEQ.N L_sp_2048_mul_32_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_2048_mul_32_inner_%=\n\t" +#else + "BLE.N L_sp_2048_mul_32_inner_%=\n\t" +#endif + "\n" + "L_sp_2048_mul_32_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0xf8\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_2048_mul_32_outer_%=\n\t" +#else + "BLE.N L_sp_2048_mul_32_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + 
"L_sp_2048_mul_32_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_2048_mul_32_store_%=\n\t" +#else + "BGT.N L_sp_2048_mul_32_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" + ); } /* Square a and put result in r. (r = a * a) @@ -2845,124 +2541,97 @@ SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) +static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #124\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" + "SUB sp, sp, #0x100\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_2048_sqr_32_outer_%=:\n\t" + "SUBS r3, r5, #0x7c\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_2048_sqr_32_inner_%=:\n\t" + "CMP r4, r3\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 4f\n\t" + "BEQ L_sp_2048_sqr_32_op_sqr_%=\n\t" #else - "beq.n 4f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ + "BEQ.N L_sp_2048_sqr_32_op_sqr_%=\n\t" +#endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[a], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "bal L_sp_2048_sqr_32_op_done_%=\n\t" + "\n" + "L_sp_2048_sqr_32_op_sqr_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "\n" + "L_sp_2048_sqr_32_op_done_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bal 5f\n\t" + "BEQ L_sp_2048_sqr_32_inner_done_%=\n\t" #else - "bal.n 5f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #128\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" + "BEQ.N L_sp_2048_sqr_32_inner_done_%=\n\t" +#endif + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" + "BGT 
L_sp_2048_sqr_32_inner_done_%=\n\t" #else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "cmp %[a], r2\n\t" + "BGT.N L_sp_2048_sqr_32_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bgt 3f\n\t" + "BLE L_sp_2048_sqr_32_inner_%=\n\t" #else - "bgt.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" + "BLE.N L_sp_2048_sqr_32_inner_%=\n\t" +#endif + "\n" + "L_sp_2048_sqr_32_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0xf8\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" + "BLE L_sp_2048_sqr_32_outer_%=\n\t" #else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #248\n\t" - "cmp r8, r6\n\t" + "BLE.N L_sp_2048_sqr_32_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_2048_sqr_32_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" + "BGT L_sp_2048_sqr_32_store_%=\n\t" #else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #252\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add sp, sp, r6\n\t" + "BGT.N L_sp_2048_sqr_32_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -2989,48 +2658,394 @@ static void sp_2048_mont_setup(const sp_digit* a, sp_digit* rho) *rho = (sp_digit)0 - x; } +#ifdef WOLFSSL_SP_SMALL /* Mul a by digit b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision digit. 
*/ -SP_NOINLINE static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a, - sp_digit b) +static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + __asm__ __volatile__ ( - "add r9, %[a], #256\n\t" /* A[0] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r5, r3, r6, %[b]\n\t" - "mov r4, #0\n\t" - "str r5, [%[r]], #4\n\t" - /* A[0] * B - Done */ - "\n1:\n\t" - "mov r5, #0\n\t" - /* A[] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r6, r8, r6, %[b]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[] * B - Done */ - "str r3, [%[r]], #4\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "cmp %[a], r9\n\t" + "LDR r8, [%[a]]\n\t" + "UMULL r5, r3, %[b], r8\n\t" + "MOV r4, #0x0\n\t" + "STR r5, [%[r]]\n\t" + "MOV r5, #0x0\n\t" + "MOV r9, #0x4\n\t" + "\n" + "L_sp_2048_mul_d_64_word_%=:\n\t" + /* A[i] * B */ + "LDR r8, [%[a], r9]\n\t" + "UMULL r6, r7, %[b], r8\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], r9]\n\t" + "MOV r3, r4\n\t" + "MOV r4, r5\n\t" + "MOV r5, #0x0\n\t" + "ADD r9, r9, #0x4\n\t" + "CMP r9, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_2048_mul_d_64_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r]]\n\t" - : [r] "+r" (r), [a] "+r" (a) - : [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9" + "BLT.N L_sp_2048_mul_d_64_word_%=\n\t" +#endif + "STR r3, [%[r], #256]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } +#else +/* Mul a by digit b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision digit. 
+ */ +static void sp_2048_mul_d_64(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + + __asm__ __volatile__ ( + /* A[0] * B */ + "LDM %[a]!, {r8}\n\t" + "UMULL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[1] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[2] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[3] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[4] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[5] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[6] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[7] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[8] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[9] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[10] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[11] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[12] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[13] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[14] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[15] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[16] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[17] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[18] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[19] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[20] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[21] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[22] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[23] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[24] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[25] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[26] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[27] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" 
+ "MOV r5, #0x0\n\t" + /* A[28] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[29] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[30] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[31] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[32] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[33] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[34] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[35] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[36] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[37] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[38] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[39] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[40] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[41] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[42] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[43] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[44] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[45] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[46] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[47] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[48] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[49] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[50] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[51] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[52] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[53] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[54] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[55] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[56] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[57] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, 
#0x0\n\t" + /* A[58] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[59] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[60] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[61] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[62] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[63] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "STR r4, [%[r]]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 2048 bits, just need to subtract. @@ -3046,6 +3061,7 @@ static void sp_2048_mont_norm_32(sp_digit* r, const sp_digit* m) sp_2048_sub_in_place_32(r, m); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -3054,141 +3070,689 @@ static void sp_2048_mont_norm_32(sp_digit* r, const sp_digit* m) * b A single precision number to subtract. * m Mask value to apply. */ -SP_NOINLINE static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, - const sp_digit* b, sp_digit m) +static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #128\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r5, [%[a], r8]\n\t" - "sbcs r5, r5, r6\n\t" - "sbcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_2048_cond_sub_32_words_%=:\n\t" + "SUBS r4, r8, r4\n\t" + "LDR r6, [%[a], r5]\n\t" + "LDR r7, [%[b], r5]\n\t" + "AND r7, r7, %[m]\n\t" + "SBCS r6, r6, r7\n\t" + "SBC r4, r8, r8\n\t" + "STR r6, [%[r], r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_2048_cond_sub_32_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_2048_cond_sub_32_words_%=\n\t" +#endif + "MOV %[r], r4\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
+ */ +static sp_digit sp_2048_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r5, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SUBS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "SBC %[r], r5, r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_NO_UMAAL /* Reduce 
the number back to 2048 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. */ -SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - sp_digit ca = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( - "mov r9, %[mp]\n\t" - "mov r12, %[m]\n\t" - "mov r10, %[a]\n\t" - "mov r4, #0\n\t" - "add r11, r10, #128\n\t" - "\n1:\n\t" + "LDR lr, [%[m]]\n\t" + /* i = 0 */ + "MOV r11, #0x0\n\t" + "MOV r3, #0x0\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[a], #4]\n\t" + "\n" + "L_sp_2048_mont_reduce_32_word_%=:\n\t" /* mu = a[i] * mp */ - "mov %[mp], r9\n\t" - "ldr %[a], [r10]\n\t" - "mul %[mp], %[mp], %[a]\n\t" - "mov %[m], r12\n\t" - "add r14, r10, #120\n\t" - "\n2:\n\t" - /* a[i+j] += m[j] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+j+1] += m[j+1] * mu */ - "ldr %[a], [r10]\n\t" - "mov r4, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r4, r4, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r5, r5, %[a]\n\t" - "adc r4, r4, #0\n\t" - "str r5, [r10], #4\n\t" - "cmp r10, r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 2b\n\t" -#else - "blt.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ + "MUL r10, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "MOV r7, #0x0\n\t" + "UMLAL r4, r7, r10, lr\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r9, [%[m], #4]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r5, r6, r10, r9\n\t" + "MOV r4, r5\n\t" + "ADDS r4, r4, r7\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r9, [%[m], #8]\n\t" + "LDR r5, [%[a], #8]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r5, r7, r10, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r9, [%[m], #12]\n\t" + "LDR r12, [%[a], #12]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #12]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r9, [%[m], #16]\n\t" + "LDR r12, [%[a], #16]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #16]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r9, [%[m], #20]\n\t" + "LDR r12, [%[a], #20]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #20]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r9, [%[m], #24]\n\t" + "LDR r12, [%[a], #24]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #24]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r9, [%[m], #28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #28]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r9, [%[m], #32]\n\t" 
+ "LDR r12, [%[a], #32]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #32]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r9, [%[m], #36]\n\t" + "LDR r12, [%[a], #36]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #36]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r9, [%[m], #40]\n\t" + "LDR r12, [%[a], #40]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #40]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r9, [%[m], #44]\n\t" + "LDR r12, [%[a], #44]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #44]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r9, [%[m], #48]\n\t" + "LDR r12, [%[a], #48]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #48]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r9, [%[m], #52]\n\t" + "LDR r12, [%[a], #52]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #52]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r9, [%[m], #56]\n\t" + "LDR r12, [%[a], #56]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #56]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r9, [%[m], #60]\n\t" + "LDR r12, [%[a], #60]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #60]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r9, [%[m], #64]\n\t" + "LDR r12, [%[a], #64]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #64]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r9, [%[m], #68]\n\t" + "LDR r12, [%[a], #68]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #68]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r9, [%[m], #72]\n\t" + "LDR r12, [%[a], #72]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #72]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r9, [%[m], #76]\n\t" + "LDR r12, [%[a], #76]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #76]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r9, [%[m], #80]\n\t" + "LDR r12, [%[a], #80]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #80]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r9, [%[m], #84]\n\t" + "LDR r12, [%[a], #84]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #84]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r9, [%[m], #88]\n\t" + "LDR r12, [%[a], #88]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #88]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r9, [%[m], #92]\n\t" + "LDR r12, [%[a], #92]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #92]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r9, [%[m], #96]\n\t" + "LDR r12, [%[a], #96]\n\t" + "MOV r7, #0x0\n\t" + 
"UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #96]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r9, [%[m], #100]\n\t" + "LDR r12, [%[a], #100]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #100]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r9, [%[m], #104]\n\t" + "LDR r12, [%[a], #104]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #104]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r9, [%[m], #108]\n\t" + "LDR r12, [%[a], #108]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #108]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r9, [%[m], #112]\n\t" + "LDR r12, [%[a], #112]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #112]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r9, [%[m], #116]\n\t" + "LDR r12, [%[a], #116]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #116]\n\t" + "ADC r6, r6, #0x0\n\t" /* a[i+30] += m[30] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" + "LDR r9, [%[m], #120]\n\t" + "LDR r12, [%[a], #120]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #120]\n\t" + "ADC r7, r7, #0x0\n\t" /* a[i+31] += m[31] * mu */ - "mov r4, %[ca]\n\t" - "mov %[ca], #0\n\t" - /* Multiply m[31] and mu - Start */ - "ldr r8, [%[m]]\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc %[ca], %[ca], #0\n\t" - /* Multiply m[31] and mu - Done */ - "ldr r6, [r10]\n\t" - "ldr r8, [r10, #4]\n\t" - "adds r6, r6, r5\n\t" - "adcs r8, r8, r4\n\t" - "adc %[ca], %[ca], #0\n\t" - "str r6, [r10]\n\t" - "str r8, [r10, #4]\n\t" - /* Next word in a */ - "sub r10, r10, #120\n\t" - "cmp r10, r11\n\t" + "LDR r9, [%[m], #124]\n\t" + "LDR r12, [%[a], #124]\n\t" + "UMULL r8, r9, r10, r9\n\t" + "ADDS r7, r7, r8\n\t" + "ADCS r6, r9, r3\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, r3\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #124]\n\t" + "LDR r12, [%[a], #128]\n\t" + "ADCS r12, r12, r6\n\t" + "STR r12, [%[a], #128]\n\t" + "ADC r3, r3, #0x0\n\t" + /* i += 1 */ + "ADD r11, r11, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r11, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_2048_mont_reduce_32_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "mov %[m], r12\n\t" - : [ca] "+r" (ca), [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" + "BLT.N L_sp_2048_mont_reduce_32_word_%=\n\t" +#endif + /* Loop Done */ + "STR r4, [%[a]]\n\t" + "STR r5, [%[a], #4]\n\t" + "MOV %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - - sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - ca); + sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp); } +#else +/* Reduce the number back to 2048 bits using 
Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_2048_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[a], #12]\n\t" + "LDR r10, [%[a], #16]\n\t" + "\n" + "L_sp_2048_mont_reduce_32_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL lr, %[mp], r6\n\t" + /* a[i+0] += m[0] * mu */ + "LDR r12, [%[m]]\n\t" + "MOV r3, #0x0\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r12, [%[m], #4]\n\t" + "MOV r6, r7\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r12, [%[m], #8]\n\t" + "MOV r7, r8\n\t" + "UMAAL r7, r3, lr, r12\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r12, [%[m], #12]\n\t" + "MOV r8, r9\n\t" + "UMAAL r8, r3, lr, r12\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r12, [%[m], #16]\n\t" + "MOV r9, r10\n\t" + "UMAAL r9, r3, lr, r12\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r12, [%[m], #20]\n\t" + "LDR r10, [%[a], #20]\n\t" + "UMAAL r10, r3, lr, r12\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r12, [%[m], #24]\n\t" + "LDR r11, [%[a], #24]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r12, [%[m], #28]\n\t" + "LDR r11, [%[a], #28]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r12, [%[m], #32]\n\t" + "LDR r11, [%[a], #32]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r12, [%[m], #36]\n\t" + "LDR r11, [%[a], #36]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r12, [%[m], #40]\n\t" + "LDR r11, [%[a], #40]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r12, [%[m], #44]\n\t" + "LDR r11, [%[a], #44]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r12, [%[m], #48]\n\t" + "LDR r11, [%[a], #48]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r12, [%[m], #52]\n\t" + "LDR r11, [%[a], #52]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r12, [%[m], #56]\n\t" + "LDR r11, [%[a], #56]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r12, [%[m], #60]\n\t" + "LDR r11, [%[a], #60]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r12, [%[m], #64]\n\t" + "LDR r11, [%[a], #64]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r12, [%[m], #68]\n\t" + "LDR r11, [%[a], #68]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r12, [%[m], #72]\n\t" + "LDR r11, [%[a], #72]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r12, [%[m], #76]\n\t" + "LDR r11, [%[a], #76]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r12, [%[m], #80]\n\t" + "LDR 
r11, [%[a], #80]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r12, [%[m], #84]\n\t" + "LDR r11, [%[a], #84]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #84]\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r12, [%[m], #88]\n\t" + "LDR r11, [%[a], #88]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r12, [%[m], #92]\n\t" + "LDR r11, [%[a], #92]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r12, [%[m], #96]\n\t" + "LDR r11, [%[a], #96]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r12, [%[m], #100]\n\t" + "LDR r11, [%[a], #100]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r12, [%[m], #104]\n\t" + "LDR r11, [%[a], #104]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r12, [%[m], #108]\n\t" + "LDR r11, [%[a], #108]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r12, [%[m], #112]\n\t" + "LDR r11, [%[a], #112]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r12, [%[m], #116]\n\t" + "LDR r11, [%[a], #116]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "LDR r12, [%[m], #120]\n\t" + "LDR r11, [%[a], #120]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "LDR r12, [%[m], #124]\n\t" + "LDR r11, [%[a], #124]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "LDR lr, [%[a], #128]\n\t" + "MOV r12, #0x0\n\t" + "UMAAL r3, lr, r12, r12\n\t" + "STR r11, [%[a], #124]\n\t" + "ADDS r3, r3, r5\n\t" + "ADC r5, lr, #0x0\n\t" + "STR r3, [%[a], #128]\n\t" + /* i += 1 */ + "ADD r4, r4, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r4, #0x80\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_2048_mont_reduce_32_word_%=\n\t" +#else + "BLT.N L_sp_2048_mont_reduce_32_word_%=\n\t" +#endif + /* Loop Done */ + "STR r6, [%[a]]\n\t" + "STR r7, [%[a], #4]\n\t" + "STR r8, [%[a], #8]\n\t" + "STR r9, [%[a], #12]\n\t" + "STR r10, [%[a], #16]\n\t" + "MOV %[mp], r5\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -3219,48 +3783,235 @@ SP_NOINLINE static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a, sp_2048_mont_reduce_32(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL /* Mul a by digit b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision digit. 
*/ -SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, - sp_digit b) +static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + __asm__ __volatile__ ( - "add r9, %[a], #128\n\t" /* A[0] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r5, r3, r6, %[b]\n\t" - "mov r4, #0\n\t" - "str r5, [%[r]], #4\n\t" - /* A[0] * B - Done */ - "\n1:\n\t" - "mov r5, #0\n\t" - /* A[] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r6, r8, r6, %[b]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[] * B - Done */ - "str r3, [%[r]], #4\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "cmp %[a], r9\n\t" + "LDR r8, [%[a]]\n\t" + "UMULL r5, r3, %[b], r8\n\t" + "MOV r4, #0x0\n\t" + "STR r5, [%[r]]\n\t" + "MOV r5, #0x0\n\t" + "MOV r9, #0x4\n\t" + "\n" + "L_sp_2048_mul_d_32_word_%=:\n\t" + /* A[i] * B */ + "LDR r8, [%[a], r9]\n\t" + "UMULL r6, r7, %[b], r8\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], r9]\n\t" + "MOV r3, r4\n\t" + "MOV r4, r5\n\t" + "MOV r5, #0x0\n\t" + "ADD r9, r9, #0x4\n\t" + "CMP r9, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_2048_mul_d_32_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r]]\n\t" - : [r] "+r" (r), [a] "+r" (a) - : [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9" + "BLT.N L_sp_2048_mul_d_32_word_%=\n\t" +#endif + "STR r3, [%[r], #128]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } +#else +/* Mul a by digit b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision digit. 
+ */ +static void sp_2048_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + + __asm__ __volatile__ ( + /* A[0] * B */ + "LDM %[a]!, {r8}\n\t" + "UMULL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[1] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[2] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[3] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[4] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[5] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[6] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[7] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[8] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[9] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[10] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[11] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[12] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[13] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[14] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[15] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[16] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[17] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[18] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[19] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[20] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[21] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[22] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[23] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[24] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[25] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[26] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[27] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" 
+ "MOV r5, #0x0\n\t" + /* A[28] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[29] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[30] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[31] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "STR r5, [%[r]]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_USE_UDIV /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -3270,49 +4021,122 @@ SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, * * Note that this is an approximate div. It may give an answer 1 larger. */ -SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, - sp_digit div) +static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - sp_digit r = 0; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( - "lsr r6, %[div], #16\n\t" - "add r6, r6, #1\n\t" - "udiv r4, %[d1], r6\n\t" - "lsl r8, r4, #16\n\t" - "umull r4, r5, %[div], r8\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r5, %[d1], r6\n\t" - "lsl r4, r5, #16\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r4, %[d0], %[div]\n\t" - "add r8, r8, r4\n\t" - "mov %[r], r8\n\t" - : [r] "+r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "r4", "r5", "r6", "r8" + "LSR r8, %[div], #16\n\t" + "ADD r5, r8, #0x1\n\t" + "UDIV r6, %[d1], r5\n\t" + "LSL r7, %[div], #16\n\t" + "LSL r6, r6, #16\n\t" + "UMULL r3, r4, %[div], r6\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "SUBS r3, %[d1], r5\n\t" + "SBC r9, r9, r9\n\t" + "ADD r9, r9, #0x1\n\t" + "RSB r10, r9, #0x0\n\t" + "LSL r9, r9, #16\n\t" + "AND r7, r7, r10\n\t" + "AND r8, r8, r10\n\t" + "SUBS %[d0], %[d0], r7\n\t" + "ADD r6, r6, r9\n\t" + "SBC %[d1], %[d1], r8\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "UMULL r3, r4, %[div], r3\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "MUL r3, %[div], r3\n\t" + "SUB %[d0], %[d0], r3\n\t" + "UDIV r3, %[d0], %[div]\n\t" + "ADD %[d1], r6, r3\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - return r; + return (uint32_t)(size_t)d1; } +#else +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. 
+ * div The divisor. + * returns the result of the division. + * + * Note that this is an approximate div. It may give an answer 1 larger. + */ +static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +{ + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; + + __asm__ __volatile__ ( + "LSR r5, %[div], #1\n\t" + "ADD r5, r5, #0x1\n\t" + "MOV r6, %[d0]\n\t" + "MOV r7, %[d1]\n\t" + /* Do top 32 */ + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "MOV r3, #0x0\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + /* Next 30 bits */ + "MOV r4, #0x1d\n\t" + "\n" + "L_div_2048_word_32_bit_%=:\n\t" + "LSLS r6, r6, #1\n\t" + "ADC r7, r7, r7\n\t" + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "ADD r3, r3, r3\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + "SUBS r4, r4, #0x1\n\t" + "bpl L_div_2048_word_32_bit_%=\n\t" + "ADD r3, r3, r3\n\t" + "ADD r3, r3, #0x1\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "SUBS r8, %[div], r9\n\t" + "SBC r8, r8, r8\n\t" + "SUB %[d1], r3, r8\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)d1; +} + +#endif /* Compare a with b in constant time. * * a A single precision integer. @@ -3320,44 +4144,395 @@ SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
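The compare walks the limbs from most to least significant while keeping the memory accesses and instruction count independent of the values. A portable-C sketch of the same idea follows; cmp_ref and the uint32_t limb type are illustrative assumptions, not part of the patch.

#include <stdint.h>

/* Reference model: scan from the top limb down, record the first difference,
 * but always touch every limb so timing does not depend on the data. */
static int32_t cmp_ref(const uint32_t* a, const uint32_t* b)
{
    int32_t  r    = 0;
    uint32_t mask = 0xffffffffU;            /* all ones until a difference is seen */
    for (int i = 31; i >= 0; i--) {
        uint32_t x = a[i] & mask;
        uint32_t y = b[i] & mask;
        r += (x > y);                       /* +1 at the first limb where a is larger */
        r -= (x < y);                       /* -1 at the first limb where a is smaller */
        mask &= (uint32_t)0 - (x == y);     /* once limbs differ, later limbs are masked out */
    }
    return r;                               /* -1, 0 or +1 */
}
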
*/ -SP_NOINLINE static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_2048_cmp_32(const sp_digit* a_p, const sp_digit* b_p) { - sp_digit r = 0; - + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mvn r3, r3\n\t" - "mov r6, #124\n\t" - "\n1:\n\t" - "ldr r8, [%[a], r6]\n\t" - "ldr r5, [%[b], r6]\n\t" - "and r8, r8, r3\n\t" - "and r5, r5, r3\n\t" - "mov r4, r8\n\t" - "subs r8, r8, r5\n\t" - "sbc r8, r8, r8\n\t" - "add %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "subs r5, r5, r4\n\t" - "sbc r8, r8, r8\n\t" - "sub %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "sub r6, r6, #4\n\t" - "cmp r6, #0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 1b\n\t" + "MOV r2, #0x-1\n\t" + "MOV r8, #0x1\n\t" + "MOV r7, #0x0\n\t" + "MOV r3, #0x-1\n\t" +#ifdef WOLFSSL_SP_SMALL + "MOV r6, #0x7c\n\t" + "\n" + "L_sp_2048_cmp_32_words_%=:\n\t" + "LDR r4, [%[a], r6]\n\t" + "LDR r5, [%[b], r6]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "SUBS r6, r6, #0x4\n\t" + "bcs L_sp_2048_cmp_32_words_%=\n\t" + "EOR r2, r2, r3\n\t" #else - "bge.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "r3", "r4", "r5", "r6", "r8" + "LDR r4, [%[a], #124]\n\t" + "LDR r5, [%[b], #124]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #120]\n\t" + "LDR r5, [%[b], #120]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #116]\n\t" + "LDR r5, [%[b], #116]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #112]\n\t" + "LDR r5, [%[b], #112]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #108]\n\t" + "LDR r5, [%[b], #108]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #104]\n\t" + "LDR r5, [%[b], #104]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #100]\n\t" + "LDR r5, [%[b], #100]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #96]\n\t" + "LDR r5, [%[b], #96]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #92]\n\t" + "LDR r5, [%[b], #92]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi 
r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #88]\n\t" + "LDR r5, [%[b], #88]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #84]\n\t" + "LDR r5, [%[b], #84]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #80]\n\t" + "LDR r5, [%[b], #80]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #76]\n\t" + "LDR r5, [%[b], #76]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #72]\n\t" + "LDR r5, [%[b], #72]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #68]\n\t" + "LDR r5, [%[b], #68]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #64]\n\t" + "LDR r5, [%[b], #64]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #60]\n\t" + "LDR r5, [%[b], #60]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #56]\n\t" + "LDR r5, [%[b], #56]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #52]\n\t" + "LDR r5, [%[b], #52]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #48]\n\t" + "LDR r5, [%[b], #48]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #44]\n\t" + "LDR r5, [%[b], #44]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #40]\n\t" + "LDR r5, [%[b], #40]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #36]\n\t" + "LDR r5, [%[b], #36]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #32]\n\t" + "LDR r5, [%[b], #32]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR 
r4, [%[a], #28]\n\t" + "LDR r5, [%[b], #28]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #24]\n\t" + "LDR r5, [%[b], #24]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #20]\n\t" + "LDR r5, [%[b], #20]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #16]\n\t" + "LDR r5, [%[b], #16]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #12]\n\t" + "LDR r5, [%[b], #12]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #8]\n\t" + "LDR r5, [%[b], #8]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #4]\n\t" + "LDR r5, [%[b], #4]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[b]]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "EOR r2, r2, r3\n\t" +#endif /*WOLFSSL_SP_SMALL */ + "MOV %[a], r2\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); - - return r; + return (uint32_t)(size_t)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -3756,6 +4931,7 @@ static void sp_2048_mont_norm_64(sp_digit* r, const sp_digit* m) } #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ +#ifdef WOLFSSL_SP_SMALL /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -3764,142 +4940,1217 @@ static void sp_2048_mont_norm_64(sp_digit* r, const sp_digit* m) * b A single precision number to subtract. * m Mask value to apply. 
*/ -SP_NOINLINE static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a, - const sp_digit* b, sp_digit m) +static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #1\n\t" - "lsl r5, r5, #8\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r5, [%[a], r8]\n\t" - "sbcs r5, r5, r6\n\t" - "sbcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_2048_cond_sub_64_words_%=:\n\t" + "SUBS r4, r8, r4\n\t" + "LDR r6, [%[a], r5]\n\t" + "LDR r7, [%[b], r5]\n\t" + "AND r7, r7, %[m]\n\t" + "SBCS r6, r6, r7\n\t" + "SBC r4, r8, r8\n\t" + "STR r6, [%[r], r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_2048_cond_sub_64_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_2048_cond_sub_64_words_%=\n\t" +#endif + "MOV %[r], r4\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
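Both conditional-subtract variants execute the same instruction sequence whether or not the subtraction takes effect; the mask simply zeroes b beforehand, so m == 0 copies a and m == -1 subtracts. A rough C model follows; cond_sub_ref is a hypothetical name and limbs are assumed 32-bit.

#include <stdint.h>

/* Reference model: r = a - (b & m) over 64 limbs, returning the borrow. */
static uint32_t cond_sub_ref(uint32_t* r, const uint32_t* a, const uint32_t* b, uint32_t m)
{
    uint64_t borrow = 0;
    for (int i = 0; i < 64; i++) {
        uint64_t d = (uint64_t)a[i] - (b[i] & m) - borrow;
        r[i]   = (uint32_t)d;
        borrow = (d >> 32) & 1;             /* 1 when the limb subtraction wrapped */
    }
    return (uint32_t)(0 - borrow);          /* 0 or 0xffffffff, matching the final SBC */
}
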
+ */ +static sp_digit sp_2048_cond_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r5, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SUBS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, 
r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "SBC %[r], r5, r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_NO_UMAAL /* Reduce the number back to 2048 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. 
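The reduction processes one limb of a per outer iteration: it derives mu from a[i] and mp, adds mu*m into a at offset i, and carries the overflow along the top limb. A compact C model of that flow follows, assuming a holds the 128-limb value to reduce and mp is the negative inverse of m[0] mod 2^32; mont_reduce_ref is a hypothetical name, not part of the patch.

#include <stdint.h>

/* Reference model of the word-by-word Montgomery reduction over 64 limbs. */
static void mont_reduce_ref(uint32_t* a, const uint32_t* m, uint32_t mp)
{
    uint32_t over = 0;                      /* carry out of the top limb so far */
    for (int i = 0; i < 64; i++) {
        uint32_t mu    = a[i] * mp;         /* makes a[i] + mu*m[0] == 0 mod 2^32 */
        uint64_t carry = 0;
        for (int j = 0; j < 64; j++) {
            uint64_t t = (uint64_t)a[i + j] + (uint64_t)mu * m[j] + carry;
            a[i + j]   = (uint32_t)t;
            carry      = t >> 32;
        }
        uint64_t t = (uint64_t)a[i + 64] + carry + over;
        a[i + 64]  = (uint32_t)t;
        over       = (uint32_t)(t >> 32);
    }
    /* The caller then subtracts m once more, conditionally, using 0 - over
     * as the mask, as the cond_sub call after the asm block does. */
}
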
*/ -SP_NOINLINE static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - sp_digit ca = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( - "mov r9, %[mp]\n\t" - "mov r12, %[m]\n\t" - "mov r10, %[a]\n\t" - "mov r4, #0\n\t" - "add r11, r10, #256\n\t" - "\n1:\n\t" + "LDR lr, [%[m]]\n\t" + /* i = 0 */ + "MOV r11, #0x0\n\t" + "MOV r3, #0x0\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[a], #4]\n\t" + "\n" + "L_sp_2048_mont_reduce_64_word_%=:\n\t" /* mu = a[i] * mp */ - "mov %[mp], r9\n\t" - "ldr %[a], [r10]\n\t" - "mul %[mp], %[mp], %[a]\n\t" - "mov %[m], r12\n\t" - "add r14, r10, #248\n\t" - "\n2:\n\t" - /* a[i+j] += m[j] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+j+1] += m[j+1] * mu */ - "ldr %[a], [r10]\n\t" - "mov r4, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r4, r4, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r5, r5, %[a]\n\t" - "adc r4, r4, #0\n\t" - "str r5, [r10], #4\n\t" - "cmp r10, r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 2b\n\t" -#else - "blt.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ + "MUL r10, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "MOV r7, #0x0\n\t" + "UMLAL r4, r7, r10, lr\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r9, [%[m], #4]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r5, r6, r10, r9\n\t" + "MOV r4, r5\n\t" + "ADDS r4, r4, r7\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r9, [%[m], #8]\n\t" + "LDR r5, [%[a], #8]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r5, r7, r10, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r9, [%[m], #12]\n\t" + "LDR r12, [%[a], #12]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #12]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r9, [%[m], #16]\n\t" + "LDR r12, [%[a], #16]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #16]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r9, [%[m], #20]\n\t" + "LDR r12, [%[a], #20]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #20]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r9, [%[m], #24]\n\t" + "LDR r12, [%[a], #24]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #24]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r9, [%[m], #28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #28]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r9, [%[m], #32]\n\t" + "LDR r12, [%[a], #32]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #32]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r9, [%[m], #36]\n\t" + "LDR r12, 
[%[a], #36]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #36]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r9, [%[m], #40]\n\t" + "LDR r12, [%[a], #40]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #40]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r9, [%[m], #44]\n\t" + "LDR r12, [%[a], #44]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #44]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r9, [%[m], #48]\n\t" + "LDR r12, [%[a], #48]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #48]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r9, [%[m], #52]\n\t" + "LDR r12, [%[a], #52]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #52]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r9, [%[m], #56]\n\t" + "LDR r12, [%[a], #56]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #56]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r9, [%[m], #60]\n\t" + "LDR r12, [%[a], #60]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #60]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r9, [%[m], #64]\n\t" + "LDR r12, [%[a], #64]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #64]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r9, [%[m], #68]\n\t" + "LDR r12, [%[a], #68]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #68]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r9, [%[m], #72]\n\t" + "LDR r12, [%[a], #72]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #72]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r9, [%[m], #76]\n\t" + "LDR r12, [%[a], #76]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #76]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r9, [%[m], #80]\n\t" + "LDR r12, [%[a], #80]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #80]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r9, [%[m], #84]\n\t" + "LDR r12, [%[a], #84]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #84]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r9, [%[m], #88]\n\t" + "LDR r12, [%[a], #88]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #88]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r9, [%[m], #92]\n\t" + "LDR r12, [%[a], #92]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #92]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r9, [%[m], #96]\n\t" + "LDR r12, [%[a], #96]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #96]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r9, [%[m], #100]\n\t" + "LDR r12, [%[a], #100]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #100]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r9, [%[m], #104]\n\t" + "LDR r12, [%[a], #104]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #104]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r9, [%[m], #108]\n\t" + "LDR r12, [%[a], #108]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #108]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r9, [%[m], #112]\n\t" + "LDR r12, [%[a], #112]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #112]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r9, [%[m], #116]\n\t" + "LDR r12, [%[a], #116]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #116]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+30] += m[30] * mu */ + "LDR r9, [%[m], #120]\n\t" + "LDR r12, [%[a], #120]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #120]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+31] += m[31] * mu */ + "LDR r9, [%[m], #124]\n\t" + "LDR r12, [%[a], #124]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #124]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+32] += m[32] * mu */ + "LDR r9, [%[m], #128]\n\t" + "LDR r12, [%[a], #128]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #128]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+33] += m[33] * mu */ + "LDR r9, [%[m], #132]\n\t" + "LDR r12, [%[a], #132]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #132]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+34] += m[34] * mu */ + "LDR r9, [%[m], #136]\n\t" + "LDR r12, [%[a], #136]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #136]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+35] += m[35] * mu */ + "LDR r9, [%[m], #140]\n\t" + "LDR r12, [%[a], #140]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #140]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+36] += m[36] * mu */ + "LDR r9, [%[m], #144]\n\t" + "LDR r12, [%[a], #144]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #144]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+37] += m[37] * mu */ + "LDR r9, [%[m], #148]\n\t" + "LDR r12, [%[a], #148]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #148]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+38] += m[38] * mu */ + "LDR r9, [%[m], #152]\n\t" + "LDR r12, [%[a], #152]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #152]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+39] += m[39] * mu */ + "LDR r9, [%[m], #156]\n\t" + "LDR r12, [%[a], #156]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #156]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+40] += m[40] * mu */ + "LDR r9, [%[m], #160]\n\t" + "LDR r12, [%[a], #160]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #160]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+41] += m[41] * mu */ + "LDR r9, [%[m], #164]\n\t" + "LDR r12, [%[a], #164]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #164]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+42] += m[42] * mu */ + "LDR r9, [%[m], #168]\n\t" + "LDR r12, [%[a], #168]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #168]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+43] += m[43] * mu */ + "LDR r9, [%[m], #172]\n\t" + "LDR r12, [%[a], #172]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #172]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+44] += m[44] * mu */ + "LDR r9, [%[m], #176]\n\t" + "LDR r12, [%[a], #176]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #176]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+45] += m[45] * mu */ + "LDR r9, [%[m], #180]\n\t" + "LDR r12, [%[a], #180]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #180]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+46] += m[46] * mu */ + "LDR r9, [%[m], #184]\n\t" + "LDR r12, [%[a], #184]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #184]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+47] += m[47] * mu */ + "LDR r9, [%[m], #188]\n\t" + "LDR r12, [%[a], #188]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #188]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+48] += m[48] * mu */ + "LDR r9, [%[m], #192]\n\t" + "LDR r12, [%[a], #192]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #192]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+49] += m[49] * mu */ + "LDR r9, [%[m], #196]\n\t" + "LDR r12, [%[a], #196]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #196]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+50] += m[50] * mu */ + "LDR r9, [%[m], #200]\n\t" + "LDR r12, [%[a], #200]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #200]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+51] += m[51] * mu */ + "LDR r9, [%[m], #204]\n\t" + "LDR r12, [%[a], #204]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #204]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+52] += m[52] * mu */ + "LDR r9, [%[m], #208]\n\t" + "LDR r12, [%[a], #208]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #208]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+53] += m[53] * mu */ + "LDR r9, [%[m], #212]\n\t" + "LDR r12, [%[a], #212]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #212]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+54] += m[54] * mu */ + "LDR r9, [%[m], #216]\n\t" + "LDR r12, [%[a], #216]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #216]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+55] += m[55] * mu */ + "LDR r9, [%[m], #220]\n\t" + "LDR r12, [%[a], #220]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #220]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+56] += m[56] * mu */ + "LDR r9, [%[m], #224]\n\t" + "LDR r12, [%[a], #224]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #224]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+57] += m[57] * mu */ + "LDR r9, [%[m], #228]\n\t" + "LDR r12, [%[a], #228]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #228]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+58] += m[58] * mu */ + "LDR r9, [%[m], #232]\n\t" + "LDR r12, [%[a], #232]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #232]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+59] += m[59] * mu */ + "LDR r9, [%[m], #236]\n\t" + "LDR r12, [%[a], #236]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #236]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+60] += m[60] * mu */ + "LDR r9, [%[m], #240]\n\t" + "LDR r12, [%[a], #240]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #240]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+61] += m[61] * mu */ + "LDR r9, [%[m], #244]\n\t" + "LDR r12, [%[a], #244]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #244]\n\t" + "ADC r6, r6, #0x0\n\t" /* a[i+62] += m[62] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" + "LDR r9, [%[m], #248]\n\t" + "LDR r12, [%[a], #248]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #248]\n\t" + "ADC r7, r7, #0x0\n\t" /* a[i+63] += m[63] * mu */ - "mov r4, %[ca]\n\t" - "mov %[ca], #0\n\t" - /* Multiply m[63] and mu - Start */ - "ldr r8, [%[m]]\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc %[ca], %[ca], #0\n\t" - /* Multiply m[63] and mu - Done */ - "ldr r6, [r10]\n\t" - "ldr r8, [r10, #4]\n\t" - "adds r6, r6, r5\n\t" - "adcs r8, r8, r4\n\t" - "adc %[ca], %[ca], #0\n\t" - "str r6, [r10]\n\t" - "str r8, [r10, #4]\n\t" - /* Next word in a */ - "sub r10, r10, #248\n\t" - "cmp r10, r11\n\t" + "LDR r9, [%[m], #252]\n\t" + "LDR r12, [%[a], #252]\n\t" + "UMULL r8, r9, r10, r9\n\t" + "ADDS r7, r7, r8\n\t" + "ADCS r6, r9, r3\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, r3\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #252]\n\t" + "LDR r12, [%[a], #256]\n\t" + "ADCS r12, r12, r6\n\t" + "STR r12, [%[a], #256]\n\t" + "ADC r3, r3, #0x0\n\t" + /* i += 1 */ + "ADD r11, r11, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r11, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_2048_mont_reduce_64_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "mov %[m], r12\n\t" - : [ca] "+r" (ca), [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" + "BLT.N L_sp_2048_mont_reduce_64_word_%=\n\t" +#endif + /* Loop Done */ + "STR r4, [%[a]]\n\t" + "STR r5, [%[a], #4]\n\t" + "MOV %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - - sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - ca); + sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp); } +#else +/* Reduce the number back to 2048 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
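The second variant leans on UMAAL, which folds the multiply and both accumulations into one instruction and cannot overflow: (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1 still fits in 64 bits, so no extra carry word is needed per step. One step modelled in C, with umaal_step as an illustrative name:

#include <stdint.h>

/* UMAAL RdLo, RdHi, Rn, Rm: RdHi:RdLo = Rn*Rm + RdLo + RdHi. */
static void umaal_step(uint32_t* rd_lo, uint32_t* rd_hi, uint32_t rn, uint32_t rm)
{
    uint64_t t = (uint64_t)rn * rm + *rd_lo + *rd_hi;
    *rd_lo = (uint32_t)t;          /* low half: the updated limb of a */
    *rd_hi = (uint32_t)(t >> 32);  /* high half: the running carry (r3 in the asm) */
}

This is why the file carries two code paths: cores without UMAAL (guarded by WOLFSSL_SP_NO_UMAAL) fall back to the UMLAL form above.
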
+ */ +static void sp_2048_mont_reduce_64(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[a], #12]\n\t" + "LDR r10, [%[a], #16]\n\t" + "\n" + "L_sp_2048_mont_reduce_64_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL lr, %[mp], r6\n\t" + /* a[i+0] += m[0] * mu */ + "LDR r12, [%[m]]\n\t" + "MOV r3, #0x0\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r12, [%[m], #4]\n\t" + "MOV r6, r7\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r12, [%[m], #8]\n\t" + "MOV r7, r8\n\t" + "UMAAL r7, r3, lr, r12\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r12, [%[m], #12]\n\t" + "MOV r8, r9\n\t" + "UMAAL r8, r3, lr, r12\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r12, [%[m], #16]\n\t" + "MOV r9, r10\n\t" + "UMAAL r9, r3, lr, r12\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r12, [%[m], #20]\n\t" + "LDR r10, [%[a], #20]\n\t" + "UMAAL r10, r3, lr, r12\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r12, [%[m], #24]\n\t" + "LDR r11, [%[a], #24]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r12, [%[m], #28]\n\t" + "LDR r11, [%[a], #28]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r12, [%[m], #32]\n\t" + "LDR r11, [%[a], #32]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r12, [%[m], #36]\n\t" + "LDR r11, [%[a], #36]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r12, [%[m], #40]\n\t" + "LDR r11, [%[a], #40]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r12, [%[m], #44]\n\t" + "LDR r11, [%[a], #44]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r12, [%[m], #48]\n\t" + "LDR r11, [%[a], #48]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r12, [%[m], #52]\n\t" + "LDR r11, [%[a], #52]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r12, [%[m], #56]\n\t" + "LDR r11, [%[a], #56]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r12, [%[m], #60]\n\t" + "LDR r11, [%[a], #60]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r12, [%[m], #64]\n\t" + "LDR r11, [%[a], #64]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r12, [%[m], #68]\n\t" + "LDR r11, [%[a], #68]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r12, [%[m], #72]\n\t" + "LDR r11, [%[a], #72]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r12, [%[m], #76]\n\t" + "LDR r11, [%[a], #76]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r12, [%[m], #80]\n\t" + "LDR r11, [%[a], #80]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r12, [%[m], #84]\n\t" + "LDR r11, [%[a], #84]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + 
"STR r11, [%[a], #84]\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r12, [%[m], #88]\n\t" + "LDR r11, [%[a], #88]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r12, [%[m], #92]\n\t" + "LDR r11, [%[a], #92]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r12, [%[m], #96]\n\t" + "LDR r11, [%[a], #96]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r12, [%[m], #100]\n\t" + "LDR r11, [%[a], #100]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r12, [%[m], #104]\n\t" + "LDR r11, [%[a], #104]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r12, [%[m], #108]\n\t" + "LDR r11, [%[a], #108]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r12, [%[m], #112]\n\t" + "LDR r11, [%[a], #112]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r12, [%[m], #116]\n\t" + "LDR r11, [%[a], #116]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "LDR r12, [%[m], #120]\n\t" + "LDR r11, [%[a], #120]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "LDR r12, [%[m], #124]\n\t" + "LDR r11, [%[a], #124]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #124]\n\t" + /* a[i+32] += m[32] * mu */ + "LDR r12, [%[m], #128]\n\t" + "LDR r11, [%[a], #128]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #128]\n\t" + /* a[i+33] += m[33] * mu */ + "LDR r12, [%[m], #132]\n\t" + "LDR r11, [%[a], #132]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #132]\n\t" + /* a[i+34] += m[34] * mu */ + "LDR r12, [%[m], #136]\n\t" + "LDR r11, [%[a], #136]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #136]\n\t" + /* a[i+35] += m[35] * mu */ + "LDR r12, [%[m], #140]\n\t" + "LDR r11, [%[a], #140]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #140]\n\t" + /* a[i+36] += m[36] * mu */ + "LDR r12, [%[m], #144]\n\t" + "LDR r11, [%[a], #144]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #144]\n\t" + /* a[i+37] += m[37] * mu */ + "LDR r12, [%[m], #148]\n\t" + "LDR r11, [%[a], #148]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #148]\n\t" + /* a[i+38] += m[38] * mu */ + "LDR r12, [%[m], #152]\n\t" + "LDR r11, [%[a], #152]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #152]\n\t" + /* a[i+39] += m[39] * mu */ + "LDR r12, [%[m], #156]\n\t" + "LDR r11, [%[a], #156]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #156]\n\t" + /* a[i+40] += m[40] * mu */ + "LDR r12, [%[m], #160]\n\t" + "LDR r11, [%[a], #160]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #160]\n\t" + /* a[i+41] += m[41] * mu */ + "LDR r12, [%[m], #164]\n\t" + "LDR r11, [%[a], #164]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #164]\n\t" + /* a[i+42] += m[42] * mu */ + "LDR r12, [%[m], #168]\n\t" + "LDR r11, [%[a], #168]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #168]\n\t" + /* a[i+43] += m[43] * mu */ + "LDR r12, [%[m], #172]\n\t" + "LDR r11, [%[a], #172]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #172]\n\t" + /* a[i+44] += m[44] * mu */ + "LDR r12, [%[m], #176]\n\t" + "LDR r11, [%[a], #176]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #176]\n\t" + /* a[i+45] += m[45] * mu */ + "LDR r12, [%[m], #180]\n\t" + 
"LDR r11, [%[a], #180]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #180]\n\t" + /* a[i+46] += m[46] * mu */ + "LDR r12, [%[m], #184]\n\t" + "LDR r11, [%[a], #184]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #184]\n\t" + /* a[i+47] += m[47] * mu */ + "LDR r12, [%[m], #188]\n\t" + "LDR r11, [%[a], #188]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #188]\n\t" + /* a[i+48] += m[48] * mu */ + "LDR r12, [%[m], #192]\n\t" + "LDR r11, [%[a], #192]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #192]\n\t" + /* a[i+49] += m[49] * mu */ + "LDR r12, [%[m], #196]\n\t" + "LDR r11, [%[a], #196]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #196]\n\t" + /* a[i+50] += m[50] * mu */ + "LDR r12, [%[m], #200]\n\t" + "LDR r11, [%[a], #200]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #200]\n\t" + /* a[i+51] += m[51] * mu */ + "LDR r12, [%[m], #204]\n\t" + "LDR r11, [%[a], #204]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #204]\n\t" + /* a[i+52] += m[52] * mu */ + "LDR r12, [%[m], #208]\n\t" + "LDR r11, [%[a], #208]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #208]\n\t" + /* a[i+53] += m[53] * mu */ + "LDR r12, [%[m], #212]\n\t" + "LDR r11, [%[a], #212]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #212]\n\t" + /* a[i+54] += m[54] * mu */ + "LDR r12, [%[m], #216]\n\t" + "LDR r11, [%[a], #216]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #216]\n\t" + /* a[i+55] += m[55] * mu */ + "LDR r12, [%[m], #220]\n\t" + "LDR r11, [%[a], #220]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #220]\n\t" + /* a[i+56] += m[56] * mu */ + "LDR r12, [%[m], #224]\n\t" + "LDR r11, [%[a], #224]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #224]\n\t" + /* a[i+57] += m[57] * mu */ + "LDR r12, [%[m], #228]\n\t" + "LDR r11, [%[a], #228]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #228]\n\t" + /* a[i+58] += m[58] * mu */ + "LDR r12, [%[m], #232]\n\t" + "LDR r11, [%[a], #232]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #232]\n\t" + /* a[i+59] += m[59] * mu */ + "LDR r12, [%[m], #236]\n\t" + "LDR r11, [%[a], #236]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #236]\n\t" + /* a[i+60] += m[60] * mu */ + "LDR r12, [%[m], #240]\n\t" + "LDR r11, [%[a], #240]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #240]\n\t" + /* a[i+61] += m[61] * mu */ + "LDR r12, [%[m], #244]\n\t" + "LDR r11, [%[a], #244]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #244]\n\t" + /* a[i+62] += m[62] * mu */ + "LDR r12, [%[m], #248]\n\t" + "LDR r11, [%[a], #248]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #248]\n\t" + /* a[i+63] += m[63] * mu */ + "LDR r12, [%[m], #252]\n\t" + "LDR r11, [%[a], #252]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "LDR lr, [%[a], #256]\n\t" + "MOV r12, #0x0\n\t" + "UMAAL r3, lr, r12, r12\n\t" + "STR r11, [%[a], #252]\n\t" + "ADDS r3, r3, r5\n\t" + "ADC r5, lr, #0x0\n\t" + "STR r3, [%[a], #256]\n\t" + /* i += 1 */ + "ADD r4, r4, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r4, #0x100\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_2048_mont_reduce_64_word_%=\n\t" +#else + "BLT.N L_sp_2048_mont_reduce_64_word_%=\n\t" +#endif + /* Loop Done */ + "STR r6, [%[a]]\n\t" + "STR r7, [%[a], #4]\n\t" + "STR r8, [%[a], #8]\n\t" + "STR r9, [%[a], #12]\n\t" + "STR r10, [%[a], #16]\n\t" + "MOV %[mp], r5\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + 
); + sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -3937,39 +6188,38 @@ SP_NOINLINE static void sp_2048_mont_sqr_64(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r5, #1\n\t" - "lsl r5, r5, #8\n\t" - "add r6, r6, r5\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "sbcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r11, #0x0\n\t" + "ADD r12, %[a], #0x100\n\t" + "\n" + "L_sp_2048_sub_64_word_%=:\n\t" + "RSBS r11, r11, #0x0\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC r11, r3, r3\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_2048_sub_64_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_2048_sub_64_word_%=\n\t" +#endif + "MOV %[r], r11\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #else @@ -3979,182 +6229,135 @@ SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
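Both sub_64 variants thread a single borrow through all 64 limbs with a SUBS/SBCS chain; the unrolled form merely drops the loop counter and pointer compare. A minimal C model follows; sub_64_ref is a hypothetical name and limbs are assumed 32-bit.

#include <stdint.h>

/* Reference model: r = a - b over 64 limbs, returning 0 or -1 for the borrow. */
static uint32_t sub_64_ref(uint32_t* r, const uint32_t* a, const uint32_t* b)
{
    uint64_t borrow = 0;
    for (int i = 0; i < 64; i++) {
        uint64_t d = (uint64_t)a[i] - b[i] - borrow;
        r[i]   = (uint32_t)d;
        borrow = d >> 63;                   /* 1 when the limb subtraction wrapped */
    }
    return (uint32_t)(0 - (uint32_t)borrow);
}
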
*/ -SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_2048_sub_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - 
"stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, 
r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_USE_UDIV /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -4164,49 +6367,122 @@ SP_NOINLINE static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, * * Note that this is an approximate div. It may give an answer 1 larger. */ -SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, - sp_digit div) +static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - sp_digit r = 0; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( - "lsr r6, %[div], #16\n\t" - "add r6, r6, #1\n\t" - "udiv r4, %[d1], r6\n\t" - "lsl r8, r4, #16\n\t" - "umull r4, r5, %[div], r8\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r5, %[d1], r6\n\t" - "lsl r4, r5, #16\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r4, %[d0], %[div]\n\t" - "add r8, r8, r4\n\t" - "mov %[r], r8\n\t" - : [r] "+r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "r4", "r5", "r6", "r8" + "LSR r8, %[div], #16\n\t" + "ADD r5, r8, #0x1\n\t" + "UDIV r6, %[d1], r5\n\t" + "LSL r7, %[div], #16\n\t" + "LSL r6, r6, #16\n\t" + "UMULL r3, r4, %[div], r6\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "SUBS r3, %[d1], r5\n\t" + "SBC r9, r9, r9\n\t" + "ADD r9, r9, #0x1\n\t" + "RSB r10, r9, #0x0\n\t" + "LSL r9, r9, #16\n\t" + "AND r7, r7, r10\n\t" + "AND r8, r8, r10\n\t" + "SUBS %[d0], %[d0], r7\n\t" + "ADD r6, r6, r9\n\t" + "SBC %[d1], %[d1], r8\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "UMULL r3, r4, %[div], r3\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR 
r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "MUL r3, %[div], r3\n\t" + "SUB %[d0], %[d0], r3\n\t" + "UDIV r3, %[d0], %[div]\n\t" + "ADD %[d1], r6, r3\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - return r; + return (uint32_t)(size_t)d1; } +#else +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. + * div The divisor. + * returns the result of the division. + * + * Note that this is an approximate div. It may give an answer 1 larger. + */ +static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +{ + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; + + __asm__ __volatile__ ( + "LSR r5, %[div], #1\n\t" + "ADD r5, r5, #0x1\n\t" + "MOV r6, %[d0]\n\t" + "MOV r7, %[d1]\n\t" + /* Do top 32 */ + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "MOV r3, #0x0\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + /* Next 30 bits */ + "MOV r4, #0x1d\n\t" + "\n" + "L_div_2048_word_64_bit_%=:\n\t" + "LSLS r6, r6, #1\n\t" + "ADC r7, r7, r7\n\t" + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "ADD r3, r3, r3\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + "SUBS r4, r4, #0x1\n\t" + "bpl L_div_2048_word_64_bit_%=\n\t" + "ADD r3, r3, r3\n\t" + "ADD r3, r3, #0x1\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "SUBS r8, %[div], r9\n\t" + "SBC r8, r8, r8\n\t" + "SUB %[d1], r3, r8\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)d1; +} + +#endif /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -4279,6 +6555,7 @@ static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const s } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -4316,44 +6593,747 @@ static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -SP_NOINLINE static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_2048_cmp_64(const sp_digit* a_p, const sp_digit* b_p) { - sp_digit r = 0; - + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mvn r3, r3\n\t" - "mov r6, #252\n\t" - "\n1:\n\t" - "ldr r8, [%[a], r6]\n\t" - "ldr r5, [%[b], r6]\n\t" - "and r8, r8, r3\n\t" - "and r5, r5, r3\n\t" - "mov r4, r8\n\t" - "subs r8, r8, r5\n\t" - "sbc r8, r8, r8\n\t" - "add %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "subs r5, r5, r4\n\t" - "sbc r8, r8, r8\n\t" - "sub %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "sub r6, r6, #4\n\t" - "cmp r6, #0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 1b\n\t" + "MOV r2, #0x-1\n\t" + "MOV r8, #0x1\n\t" + "MOV r7, #0x0\n\t" + "MOV r3, #0x-1\n\t" +#ifdef WOLFSSL_SP_SMALL + "MOV r6, #0xfc\n\t" + "\n" + "L_sp_2048_cmp_64_words_%=:\n\t" + "LDR r4, [%[a], r6]\n\t" + "LDR r5, [%[b], r6]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "SUBS r6, r6, #0x4\n\t" + "bcs L_sp_2048_cmp_64_words_%=\n\t" + "EOR r2, r2, r3\n\t" #else - "bge.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "r3", "r4", "r5", "r6", "r8" + "LDR r4, [%[a], #252]\n\t" + "LDR r5, [%[b], #252]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #248]\n\t" + "LDR r5, [%[b], #248]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #244]\n\t" + "LDR r5, [%[b], #244]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #240]\n\t" + "LDR r5, [%[b], #240]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #236]\n\t" + "LDR r5, [%[b], #236]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #232]\n\t" + "LDR r5, [%[b], #232]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #228]\n\t" + "LDR r5, [%[b], #228]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #224]\n\t" + "LDR r5, [%[b], #224]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #220]\n\t" + "LDR r5, [%[b], #220]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + 
"movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #216]\n\t" + "LDR r5, [%[b], #216]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #212]\n\t" + "LDR r5, [%[b], #212]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #208]\n\t" + "LDR r5, [%[b], #208]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #204]\n\t" + "LDR r5, [%[b], #204]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #200]\n\t" + "LDR r5, [%[b], #200]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #196]\n\t" + "LDR r5, [%[b], #196]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #192]\n\t" + "LDR r5, [%[b], #192]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #188]\n\t" + "LDR r5, [%[b], #188]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #184]\n\t" + "LDR r5, [%[b], #184]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #180]\n\t" + "LDR r5, [%[b], #180]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #176]\n\t" + "LDR r5, [%[b], #176]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #172]\n\t" + "LDR r5, [%[b], #172]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #168]\n\t" + "LDR r5, [%[b], #168]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #164]\n\t" + "LDR r5, [%[b], #164]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #160]\n\t" + "LDR r5, [%[b], #160]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT 
ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #156]\n\t" + "LDR r5, [%[b], #156]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #152]\n\t" + "LDR r5, [%[b], #152]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #148]\n\t" + "LDR r5, [%[b], #148]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #144]\n\t" + "LDR r5, [%[b], #144]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #140]\n\t" + "LDR r5, [%[b], #140]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #136]\n\t" + "LDR r5, [%[b], #136]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #132]\n\t" + "LDR r5, [%[b], #132]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #128]\n\t" + "LDR r5, [%[b], #128]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #124]\n\t" + "LDR r5, [%[b], #124]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #120]\n\t" + "LDR r5, [%[b], #120]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #116]\n\t" + "LDR r5, [%[b], #116]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #112]\n\t" + "LDR r5, [%[b], #112]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #108]\n\t" + "LDR r5, [%[b], #108]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #104]\n\t" + "LDR r5, [%[b], #104]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #100]\n\t" + "LDR r5, [%[b], #100]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #96]\n\t" + 
"LDR r5, [%[b], #96]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #92]\n\t" + "LDR r5, [%[b], #92]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #88]\n\t" + "LDR r5, [%[b], #88]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #84]\n\t" + "LDR r5, [%[b], #84]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #80]\n\t" + "LDR r5, [%[b], #80]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #76]\n\t" + "LDR r5, [%[b], #76]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #72]\n\t" + "LDR r5, [%[b], #72]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #68]\n\t" + "LDR r5, [%[b], #68]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #64]\n\t" + "LDR r5, [%[b], #64]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #60]\n\t" + "LDR r5, [%[b], #60]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #56]\n\t" + "LDR r5, [%[b], #56]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #52]\n\t" + "LDR r5, [%[b], #52]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #48]\n\t" + "LDR r5, [%[b], #48]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #44]\n\t" + "LDR r5, [%[b], #44]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #40]\n\t" + "LDR r5, [%[b], #40]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #36]\n\t" + "LDR r5, [%[b], #36]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, 
r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #32]\n\t" + "LDR r5, [%[b], #32]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #28]\n\t" + "LDR r5, [%[b], #28]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #24]\n\t" + "LDR r5, [%[b], #24]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #20]\n\t" + "LDR r5, [%[b], #20]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #16]\n\t" + "LDR r5, [%[b], #16]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #12]\n\t" + "LDR r5, [%[b], #12]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #8]\n\t" + "LDR r5, [%[b], #8]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #4]\n\t" + "LDR r5, [%[b], #4]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[b]]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "EOR r2, r2, r3\n\t" +#endif /*WOLFSSL_SP_SMALL */ + "MOV %[a], r2\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); - - return r; + return (uint32_t)(size_t)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -4411,6 +7391,7 @@ static WC_INLINE int sp_2048_mod_64(sp_digit* r, const sp_digit* a, const sp_dig return sp_2048_div_64(a, m, NULL, r); } +#endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -4874,6 +7855,7 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, } #ifndef WOLFSSL_RSA_PUBLIC_ONLY +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -4882,39 +7864,180 @@ int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em, * b A single precision number to add. * m Mask value to apply. 
*/ -SP_NOINLINE static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) +static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #128\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adds r5, %[c], #-1\n\t" - "ldr r5, [%[a], r8]\n\t" - "adcs r5, r5, r6\n\t" - "mov %[c], #0\n\t" - "adcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r5, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "\n" + "L_sp_2048_cond_add_32_words_%=:\n\t" + "ADDS r5, r5, #0x-1\n\t" + "LDR r6, [%[a], r4]\n\t" + "LDR r7, [%[b], r4]\n\t" + "AND r7, r7, %[m]\n\t" + "ADCS r6, r6, r7\n\t" + "ADC r5, r8, r8\n\t" + "STR r6, [%[r], r4]\n\t" + "ADD r4, r4, #0x4\n\t" + "CMP r4, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_2048_cond_add_32_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_2048_cond_add_32_words_%=\n\t" +#endif + "MOV %[r], r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
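+ *
+ * This is the unrolled (non WOLFSSL_SP_SMALL) form of the same operation:
+ * the first pair of words is added with ADDS and every later pair with
+ * ADCS, so the carry propagates through all 32 words without a loop
+ * counter, and the final ADC returns the carry out.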
+ */ +static sp_digit sp_2048_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADDS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "ADC %[r], r10, r10\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* RSA private key operation. 
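 *
 * (The 32-word, 1024-bit helpers defined above, including
 * sp_2048_cond_add_32, support the CRT path of this operation, which works
 * on two half-size values and recombines them.)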
* * in Array of bytes representing the number to exponentiate, base. @@ -5229,406 +8352,401 @@ int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef WOLFSSL_HAVE_SP_DH #ifdef HAVE_FFDHE_2048 -static void sp_2048_lshift_64(sp_digit* r, const sp_digit* a, byte n) +static void sp_2048_lshift_64(sp_digit* r_p, const sp_digit* a_p, byte n_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; + __asm__ __volatile__ ( - "mov r6, #31\n\t" - "sub r6, r6, %[n]\n\t" - "add %[a], %[a], #192\n\t" - "add %[r], %[r], #192\n\t" - "ldr r3, [%[a], #60]\n\t" - "lsr r4, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r4, r4, r6\n\t" - "ldr r2, [%[a], #56]\n\t" - "str r4, [%[r], #64]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #52]\n\t" - "str r3, [%[r], #60]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #48]\n\t" - "str r2, [%[r], #56]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #44]\n\t" - "str r4, [%[r], #52]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #40]\n\t" - "str r3, [%[r], #48]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #36]\n\t" - "str r2, [%[r], #44]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #32]\n\t" - "str r4, [%[r], #40]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #28]\n\t" - "str r3, [%[r], #36]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #24]\n\t" - "str r2, [%[r], #32]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #20]\n\t" - "str r4, [%[r], #28]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #16]\n\t" - "str r3, [%[r], #24]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #12]\n\t" - "str r2, [%[r], #20]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #8]\n\t" - "str r4, [%[r], #16]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #4]\n\t" - "str r3, [%[r], #12]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #0]\n\t" - "str r2, [%[r], #8]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r4, [%[r], #68]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r3, [%[r], #64]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r2, [%[r], #60]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r4, [%[r], 
#56]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r3, [%[r], #52]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r2, [%[r], #48]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r4, [%[r], #44]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r3, [%[r], #40]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r2, [%[r], #36]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r4, [%[r], #32]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r3, [%[r], #28]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r2, [%[r], #24]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r4, [%[r], #20]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r3, [%[r], #16]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #4]\n\t" - "str r2, [%[r], #12]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #0]\n\t" - "str r4, [%[r], #8]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r4, [%[a], #60]\n\t" - "str r3, [%[r], #68]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #56]\n\t" - "str r2, [%[r], #64]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #52]\n\t" - "str r4, [%[r], #60]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #48]\n\t" - "str r3, [%[r], #56]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #44]\n\t" - "str r2, [%[r], #52]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #40]\n\t" - "str r4, [%[r], #48]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #36]\n\t" - "str r3, [%[r], #44]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #32]\n\t" - "str r2, [%[r], #40]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #28]\n\t" - "str r4, [%[r], #36]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #24]\n\t" - "str r3, [%[r], #32]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #20]\n\t" - "str r2, [%[r], #28]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, 
%[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #16]\n\t" - "str r4, [%[r], #24]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #12]\n\t" - "str r3, [%[r], #20]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #8]\n\t" - "str r2, [%[r], #16]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #4]\n\t" - "str r4, [%[r], #12]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #0]\n\t" - "str r3, [%[r], #8]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r3, [%[a], #60]\n\t" - "str r2, [%[r], #68]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #56]\n\t" - "str r4, [%[r], #64]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #52]\n\t" - "str r3, [%[r], #60]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #48]\n\t" - "str r2, [%[r], #56]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #44]\n\t" - "str r4, [%[r], #52]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #40]\n\t" - "str r3, [%[r], #48]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #36]\n\t" - "str r2, [%[r], #44]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #32]\n\t" - "str r4, [%[r], #40]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #28]\n\t" - "str r3, [%[r], #36]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #24]\n\t" - "str r2, [%[r], #32]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #20]\n\t" - "str r4, [%[r], #28]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #16]\n\t" - "str r3, [%[r], #24]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #12]\n\t" - "str r2, [%[r], #20]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #8]\n\t" - "str r4, [%[r], #16]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #4]\n\t" - "str r3, [%[r], #12]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #0]\n\t" - "str r2, [%[r], #8]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "str r3, [%[r]]\n\t" - "str r4, [%[r], #4]\n\t" + "RSB r7, %[n], #0x1f\n\t" + "LDR r5, [%[a], #252]\n\t" + "LSR r6, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r6, r6, r7\n\t" + "LDR r4, [%[a], #248]\n\t" + "STR r6, [%[r], #256]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, 
r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #244]\n\t" + "STR r5, [%[r], #252]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #240]\n\t" + "STR r4, [%[r], #248]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #236]\n\t" + "STR r6, [%[r], #244]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #232]\n\t" + "STR r5, [%[r], #240]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #228]\n\t" + "STR r4, [%[r], #236]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #224]\n\t" + "STR r6, [%[r], #232]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #220]\n\t" + "STR r5, [%[r], #228]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #216]\n\t" + "STR r4, [%[r], #224]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #212]\n\t" + "STR r6, [%[r], #220]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #208]\n\t" + "STR r5, [%[r], #216]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #204]\n\t" + "STR r4, [%[r], #212]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #200]\n\t" + "STR r6, [%[r], #208]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #196]\n\t" + "STR r5, [%[r], #204]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #192]\n\t" + "STR r4, [%[r], #200]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #188]\n\t" + "STR r6, [%[r], #196]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #184]\n\t" + "STR r5, [%[r], #192]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #180]\n\t" + "STR r4, [%[r], #188]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #176]\n\t" + "STR r6, [%[r], #184]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #172]\n\t" + "STR r5, [%[r], #180]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #168]\n\t" + "STR r4, [%[r], #176]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #164]\n\t" + "STR r6, [%[r], #172]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #160]\n\t" + "STR r5, [%[r], #168]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #156]\n\t" + "STR r4, [%[r], #164]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], 
#152]\n\t" + "STR r6, [%[r], #160]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #148]\n\t" + "STR r5, [%[r], #156]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #144]\n\t" + "STR r4, [%[r], #152]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #140]\n\t" + "STR r6, [%[r], #148]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #136]\n\t" + "STR r5, [%[r], #144]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #132]\n\t" + "STR r4, [%[r], #140]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #128]\n\t" + "STR r6, [%[r], #136]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #124]\n\t" + "STR r5, [%[r], #132]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #120]\n\t" + "STR r4, [%[r], #128]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #116]\n\t" + "STR r6, [%[r], #124]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #112]\n\t" + "STR r5, [%[r], #120]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #108]\n\t" + "STR r4, [%[r], #116]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #104]\n\t" + "STR r6, [%[r], #112]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #100]\n\t" + "STR r5, [%[r], #108]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #96]\n\t" + "STR r4, [%[r], #104]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #92]\n\t" + "STR r6, [%[r], #100]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #88]\n\t" + "STR r5, [%[r], #96]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #84]\n\t" + "STR r4, [%[r], #92]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #80]\n\t" + "STR r6, [%[r], #88]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #76]\n\t" + "STR r5, [%[r], #84]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #72]\n\t" + "STR r4, [%[r], #80]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #68]\n\t" + "STR r6, [%[r], #76]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #64]\n\t" + "STR r5, [%[r], #72]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #60]\n\t" + "STR r4, [%[r], #68]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, 
r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #56]\n\t" + "STR r6, [%[r], #64]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #52]\n\t" + "STR r5, [%[r], #60]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #48]\n\t" + "STR r4, [%[r], #56]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #44]\n\t" + "STR r6, [%[r], #52]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #40]\n\t" + "STR r5, [%[r], #48]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #36]\n\t" + "STR r4, [%[r], #44]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #32]\n\t" + "STR r6, [%[r], #40]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #28]\n\t" + "STR r5, [%[r], #36]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #24]\n\t" + "STR r4, [%[r], #32]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #20]\n\t" + "STR r6, [%[r], #28]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #16]\n\t" + "STR r5, [%[r], #24]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #12]\n\t" + "STR r4, [%[r], #20]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #8]\n\t" + "STR r6, [%[r], #16]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #4]\n\t" + "STR r5, [%[r], #12]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a]]\n\t" + "STR r4, [%[r], #8]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "STR r5, [%[r]]\n\t" + "STR r6, [%[r], #4]\n\t" + : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n) : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) - : "memory", "r2", "r3", "r4", "r5", "r6" + : "memory", "r4", "r5", "r6", "r3", "r7" ); } @@ -5930,14 +9048,14 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -6048,83 +9166,1019 @@ static void sp_3072_to_bin_96(sp_digit* r, byte* a) * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_3072_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[12 * 2]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r10, %[a]\n\t" - "mov r11, %[b]\n\t" - "mov r6, #48\n\t" - "add r6, r6, r10\n\t" - "mov r14, r6\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r5, #0\n\t" - "mov r6, #44\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov %[b], r9\n\t" - "sub %[b], %[b], %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add %[b], %[b], r11\n\t" - "\n2:\n\t" - /* Multiply Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [%[b]]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply Done */ - "add %[a], %[a], #4\n\t" - "sub %[b], %[b], #4\n\t" - "cmp %[a], r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, r9\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r12\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #88\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r], r8]\n\t" - "mov %[a], r10\n\t" - "mov %[b], r11\n\t" - : - : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; - XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); + __asm__ __volatile__ ( + "SUB sp, sp, #0x30\n\t" + /* A[0] * B[0] */ + "LDR r11, [%[a]]\n\t" + "LDR r12, [%[b]]\n\t" + "UMULL r3, r4, r11, r12\n\t" + "MOV r5, #0x0\n\t" + "STR r3, [sp]\n\t" + /* A[0] * B[1] */ + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[0] */ + "LDR r8, [%[a], #4]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #4]\n\t" + /* A[2] * B[0] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[1] */ + "LDR r11, [%[a], #4]\n\t" + "LDR r12, [%[b], #4]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[0] * B[2] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #8]\n\t" + /* A[0] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[2] */ + "LDR r9, [%[b], #8]\n\t" + 
"UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[1] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[0] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #12]\n\t" + /* A[4] * B[0] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[1] */ + "LDR r8, [%[a], #12]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[2] */ + "LDR r11, [%[a], #8]\n\t" + "LDR r12, [%[b], #8]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[3] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[0] * B[4] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #16]\n\t" + /* A[0] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[4] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[2] */ + "LDR r8, [%[a], #12]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[1] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[0] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #20]\n\t" + /* A[6] * B[0] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[1] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[2] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[3] */ + "LDR r11, [%[a], #12]\n\t" + "LDR r12, [%[b], #12]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[4] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[5] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[0] * B[6] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, 
#0x0\n\t" + "STR r3, [sp, #24]\n\t" + /* A[0] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[6] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[5] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[4] */ + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[3] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[2] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[1] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[0] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #28]\n\t" + /* A[8] * B[0] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[1] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[2] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[3] */ + "LDR r8, [%[a], #20]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[4] */ + "LDR r11, [%[a], #16]\n\t" + "LDR r12, [%[b], #16]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[5] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[6] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[7] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[0] * B[8] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #32]\n\t" + /* A[0] * B[9] */ + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[8] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[7] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[6] */ + 
"LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[4] */ + "LDR r8, [%[a], #20]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[3] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[2] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[1] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[0] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #36]\n\t" + /* A[10] * B[0] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[1] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[2] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[3] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[4] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[5] */ + "LDR r11, [%[a], #20]\n\t" + "LDR r12, [%[b], #20]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[6] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[7] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[8] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[9] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[0] * B[10] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #40]\n\t" + /* A[0] * B[11] */ + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[10] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[9] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], 
#36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[8] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[7] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[6] */ + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[5] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[4] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[3] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[2] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[1] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[0] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #44]\n\t" + /* A[11] * B[1] */ + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[2] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[3] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[4] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[5] */ + "LDR r8, [%[a], #28]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[6] */ + "LDR r11, [%[a], #24]\n\t" + "LDR r12, [%[b], #24]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[7] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[8] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[9] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[10] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[11] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, 
r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #48]\n\t" + /* A[2] * B[11] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[10] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[9] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[8] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[6] */ + "LDR r8, [%[a], #28]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[5] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[4] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[3] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[2] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #52]\n\t" + /* A[11] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[4] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[5] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[6] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[7] */ + "LDR r11, [%[a], #28]\n\t" + "LDR r12, [%[b], #28]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[8] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[9] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[10] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[11] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #56]\n\t" + /* A[4] * B[11] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, 
r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[10] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[9] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[8] */ + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[7] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[6] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[5] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[4] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #60]\n\t" + /* A[11] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[6] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[7] */ + "LDR r8, [%[a], #36]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[8] */ + "LDR r11, [%[a], #32]\n\t" + "LDR r12, [%[b], #32]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[9] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[10] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[11] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #64]\n\t" + /* A[6] * B[11] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[10] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[9] */ + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[8] */ + "LDR r8, [%[a], #36]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[7] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[6] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, 
[%[r], #68]\n\t" + /* A[11] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[8] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[9] */ + "LDR r11, [%[a], #36]\n\t" + "LDR r12, [%[b], #36]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[10] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[11] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #72]\n\t" + /* A[8] * B[11] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[10] */ + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[9] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[8] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #76]\n\t" + /* A[11] * B[9] */ + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[10] */ + "LDR r11, [%[a], #40]\n\t" + "LDR r12, [%[b], #40]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[11] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #80]\n\t" + /* A[10] * B[11] */ + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[10] */ + "LDR r8, [%[a], #44]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #84]\n\t" + /* A[11] * B[11] */ + "UMLAL r4, r5, r8, r9\n\t" + "STR r4, [%[r], #88]\n\t" + "STR r5, [%[r], #92]\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12" + ); } /* Add b to a into r. (r = a + b) @@ -6133,131 +10187,102 @@ SP_NOINLINE static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
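
The rewritten sp_3072_mul_12 above is a fully unrolled product scan: for each output word it accumulates every a[i]*b[k-i] product with UMULL into a rotating three-register column sum (low word, high word, overflow word), stores the low word, and shifts the column down; recently used operand words are also cached in r11/r12 and reused for neighbouring products. A portable sketch of the same column accumulation, using a 64-bit accumulator plus an explicit overflow word (names are illustrative, and the real routine is unrolled rather than looped):

#include <stdint.h>

typedef uint32_t sp_digit;

/* Schoolbook multiply of two 12-word numbers into a 24-word result,
 * accumulating one output column at a time as the assembly does. */
static void mul_columns_12(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    uint64_t acc = 0;    /* low + middle words of the running column sum */
    uint32_t over = 0;   /* third word: carries that overflow 64 bits (the ADC ..., #0) */
    int k;

    for (k = 0; k <= 22; k++) {
        int i_min = (k < 12) ? 0 : (k - 11);
        int i_max = (k < 12) ? k : 11;
        int i;
        for (i = i_min; i <= i_max; i++) {
            uint64_t p = (uint64_t)a[i] * b[k - i];  /* UMULL */
            acc += p;                                /* ADDS/ADCS */
            over += (acc < p);                       /* ADC of the overflow word */
        }
        r[k] = (sp_digit)acc;                        /* STR of the column's low word */
        acc = (acc >> 32) | ((uint64_t)over << 32);  /* rotate the column sum down */
        over = 0;
    }
    r[23] = (sp_digit)acc;                           /* final high word */
}

The assembly writes the first twelve column results to the stack and only copies them into r at the end (the LDM sp!/STM %[r]! pairs), so the low half of r can safely overlap a or b.
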
*/ -SP_NOINLINE static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } -/* Sub b from a into r. (r = a - b) +/* Sub b from a into a. (a -= b) * - * r A single precision integer. - * a A single precision integer. + * a A single precision integer and result. * b A single precision integer. 
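
sp_3072_add_12 above now walks the operands four words at a time with LDM, chains ADDS/ADCS across all twelve words, and finally materialises the carry with "MOV %[r], #0; ADC %[r], %[r], #0", so the carry comes back in r0 and is returned as (uint32_t)(size_t)r. A portable sketch of that carry chain (the function name and loop form are illustrative; the real routine is unrolled):

#include <stdint.h>

typedef uint32_t sp_digit;

/* r = a + b over n words; returns the final carry (0 or 1), like the
 * trailing MOV/ADC pair in the assembly. */
static sp_digit add_words(sp_digit* r, const sp_digit* a, const sp_digit* b, int n)
{
    sp_digit carry = 0;
    int i;

    for (i = 0; i < n; i++) {
        sp_digit s = a[i] + carry;
        sp_digit c1 = (sp_digit)(s < carry);   /* overflow from adding the carry in */
        r[i] = s + b[i];
        carry = c1 + (sp_digit)(r[i] < s);     /* overflow from adding b[i] */
    }
    return carry;
}
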
*/ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_sub_in_place_24(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); - - return c; + return (uint32_t)(size_t)a; } /* Add b to a into r. (r = a + b) @@ -6266,80 +10291,62 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a, * a A single precision integer. 
* b A single precision integer. */ -SP_NOINLINE static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_add_24(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", 
"r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* AND m into each word of a and store in r. @@ -6411,145 +10418,107 @@ SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a, (void)sp_3072_add_12(r + 36, r + 36, a1); } -/* Sub b from a into r. (r = a - b) +/* Sub b from a into a. (a -= b) * - * r A single precision integer. - * a A single precision integer. + * a A single precision integer and result. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - 
"ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); - - return c; + return (uint32_t)(size_t)a; } /* Add b to a into r. (r = a + b) @@ -6558,140 +10527,104 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - 
"stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* AND m into each word of a and store in r. @@ -6763,265 +10696,191 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, (void)sp_3072_add_24(r + 72, r + 72, a1); } -/* Sub b from a into r. (r = a - b) +/* Sub b from a into a. (a -= b) * - * r A single precision integer. - * a A single precision integer. + * a A single precision integer and result. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - 
"sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM 
%[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, 
r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); - - return c; + return (uint32_t)(size_t)a; } /* Add b to a into r. (r = a + b) @@ -7030,260 +10889,188 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_add_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, 
r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm 
%[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, 
r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* AND m into each word of a and store in r. @@ -7360,122 +11147,692 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. 
*/ -SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) +static void sp_3072_sqr_12(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #96\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #44\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 4f\n\t" -#else - "beq.n 4f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bal 5f\n\t" -#else - "bal.n 5f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #48\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "cmp %[a], r2\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bgt 3f\n\t" -#else - "bgt.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #88\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #92\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, #96\n\t" - "add sp, sp, r6\n\t" + "SUB sp, sp, #0x30\n\t" + /* A[0] * A[0] */ + "LDR r10, [%[a]]\n\t" + "UMULL r8, r3, r10, r10\n\t" + "MOV r4, #0x0\n\t" + "STR r8, [sp]\n\t" + /* A[0] * A[1] */ + "LDR r10, [%[a], #4]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV 
r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [sp, #4]\n\t" + /* A[0] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * A[1] */ + "LDR r10, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #8]\n\t" + /* A[0] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r2, [sp, #12]\n\t" + /* A[0] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[1] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[2] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [sp, #16]\n\t" + /* A[0] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #20]\n\t" + /* A[0] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #24]\n\t" + /* A[0] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #4]\n\t" 
+ "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #28]\n\t" + /* A[0] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #32]\n\t" + /* A[0] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #36]\n\t" + /* A[0] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, 
#0x0\n\t" + /* A[5] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #40]\n\t" + /* A[0] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #44]\n\t" + /* A[1] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[2] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #48]\n\t" + /* A[2] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[3] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, 
r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [%[r], #52]\n\t" + /* A[3] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[4] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #56]\n\t" + /* A[4] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[5] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #60]\n\t" + /* A[5] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[6] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [%[r], #64]\n\t" + /* A[6] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[7] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC 
r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #68]\n\t" + /* A[7] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r2, [%[r], #72]\n\t" + /* A[8] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[9] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [%[r], #76]\n\t" + /* A[9] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #80]\n\t" + /* A[10] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r2, [%[r], #84]\n\t" + /* A[11] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "UMLAL r3, r4, r10, r10\n\t" + "STR r3, [%[r], #88]\n\t" + "STR r4, [%[r], #92]\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" ); } @@ -7485,49 +11842,40 @@ SP_NOINLINE static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_sub_12(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* Square a and put result in r. (r = a * a) @@ -7572,79 +11920,61 @@ SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_sub_24(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_sub_24(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* Square a 
and put result in r. (r = a * a) @@ -7689,139 +12019,103 @@ SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, 
{r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* Square a and put result in r. (r = a * a) @@ -7868,39 +12162,39 @@ SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_add_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r8, #0\n\t" - "add r6, r6, #384\n\t" - "sub r8, r8, #1\n\t" - "\n1:\n\t" - "adds %[c], %[c], r8\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r3, #0x0\n\t" + "ADD r12, %[a], #0x180\n\t" + "\n" + "L_sp_3072_add_96_word_%=:\n\t" + "ADDS r3, r3, #0x-1\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "MOV r4, #0x0\n\t" + "ADC r3, r4, #0x0\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_3072_add_96_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_3072_add_96_word_%=\n\t" +#endif + "MOV %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -7910,39 +12204,37 @@ SP_NOINLINE static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_sub_in_place_96(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r8, %[a]\n\t" - "add r8, r8, #384\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #8\n\t" - "add %[b], %[b], #8\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" -#else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r8" - ); + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; - return c; + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "ADD r11, %[a], #0x180\n\t" + "\n" + "L_sp_3072_sub_in_pkace_96_word_%=:\n\t" + "RSBS r10, r10, #0x0\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC r10, r10, r10\n\t" + "CMP %[a], r11\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BNE L_sp_3072_sub_in_pkace_96_word_%=\n\t" +#else + "BNE.N L_sp_3072_sub_in_pkace_96_word_%=\n\t" +#endif + "MOV %[a], r10\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)a; } #endif /* WOLFSSL_SP_SMALL */ @@ -7953,89 +12245,74 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_96(sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[96 * 2]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r10, %[a]\n\t" - "mov r11, %[b]\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #128\n\t" - "add r6, r6, r10\n\t" - "mov r14, r6\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r5, #0\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #124\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov %[b], r9\n\t" - "sub %[b], %[b], %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add %[b], %[b], r11\n\t" - "\n2:\n\t" - /* Multiply Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [%[b]]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply Done */ - "add %[a], %[a], #4\n\t" - "sub %[b], %[b], #4\n\t" - "cmp %[a], r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, r9\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r12\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #2\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #248\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r], r8]\n\t" - "mov %[a], r10\n\t" - "mov %[b], r11\n\t" - : - : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; - XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); + __asm__ __volatile__ ( + "SUB sp, sp, #0x300\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "\n" + "L_sp_3072_mul_96_outer_%=:\n\t" + "SUBS r3, r5, #0x17c\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_3072_mul_96_inner_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x180\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_3072_mul_96_inner_done_%=\n\t" +#else + "BEQ.N L_sp_3072_mul_96_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_3072_mul_96_inner_%=\n\t" +#else + "BLE.N L_sp_3072_mul_96_inner_%=\n\t" +#endif + "\n" + "L_sp_3072_mul_96_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x2f8\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + 
"BLE L_sp_3072_mul_96_outer_%=\n\t" +#else + "BLE.N L_sp_3072_mul_96_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_3072_mul_96_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_3072_mul_96_store_%=\n\t" +#else + "BGT.N L_sp_3072_mul_96_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" + ); } /* Square a and put result in r. (r = a * a) @@ -8043,132 +12320,97 @@ SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) +static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #3\n\t" - "lsl r6, r6, #8\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #124\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" + "SUB sp, sp, #0x300\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_3072_sqr_96_outer_%=:\n\t" + "SUBS r3, r5, #0x17c\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_3072_sqr_96_inner_%=:\n\t" + "CMP r4, r3\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 4f\n\t" + "BEQ L_sp_3072_sqr_96_op_sqr_%=\n\t" #else - "beq.n 4f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ + "BEQ.N L_sp_3072_sqr_96_op_sqr_%=\n\t" +#endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[a], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "bal L_sp_3072_sqr_96_op_done_%=\n\t" + "\n" + "L_sp_3072_sqr_96_op_sqr_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "\n" + "L_sp_3072_sqr_96_op_done_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bal 5f\n\t" + "BEQ L_sp_3072_sqr_96_inner_done_%=\n\t" #else - "bal.n 5f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #128\n\t" - 
"add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" + "BEQ.N L_sp_3072_sqr_96_inner_done_%=\n\t" +#endif + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" + "BGT L_sp_3072_sqr_96_inner_done_%=\n\t" #else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "cmp %[a], r2\n\t" + "BGT.N L_sp_3072_sqr_96_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bgt 3f\n\t" + "BLE L_sp_3072_sqr_96_inner_%=\n\t" #else - "bgt.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" + "BLE.N L_sp_3072_sqr_96_inner_%=\n\t" +#endif + "\n" + "L_sp_3072_sqr_96_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x2f8\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" + "BLE L_sp_3072_sqr_96_outer_%=\n\t" #else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #2\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #248\n\t" - "cmp r8, r6\n\t" + "BLE.N L_sp_3072_sqr_96_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_3072_sqr_96_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" + "BGT L_sp_3072_sqr_96_store_%=\n\t" #else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #2\n\t" - "lsl r3, r3, #8\n\t" - "add r3, r3, #252\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, #3\n\t" - "lsl r6, r6, #8\n\t" - "add sp, sp, r6\n\t" + "BGT.N L_sp_3072_sqr_96_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -8198,39 +12440,39 @@ static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m) * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r8, #0\n\t" - "add r6, r6, #192\n\t" - "sub r8, r8, #1\n\t" - "\n1:\n\t" - "adds %[c], %[c], r8\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r3, #0x0\n\t" + "ADD r12, %[a], #0xc0\n\t" + "\n" + "L_sp_3072_add_48_word_%=:\n\t" + "ADDS r3, r3, #-1\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "MOV r4, #0x0\n\t" + "ADC r3, r4, #0x0\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_3072_add_48_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_3072_add_48_word_%=\n\t" +#endif + "MOV %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -8240,39 +12482,37 @@ SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_sub_in_place_48(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r8, %[a]\n\t" - "add r8, r8, #192\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #8\n\t" - "add %[b], %[b], #8\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" -#else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r8" - ); + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; - return c; + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "ADD r11, %[a], #0xc0\n\t" + "\n" + "L_sp_3072_sub_in_pkace_48_word_%=:\n\t" + "RSBS r10, r10, #0x0\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC r10, r10, r10\n\t" + "CMP %[a], r11\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BNE L_sp_3072_sub_in_pkace_48_word_%=\n\t" +#else + "BNE.N L_sp_3072_sub_in_pkace_48_word_%=\n\t" +#endif + "MOV %[a], r10\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)a; } #endif /* WOLFSSL_SP_SMALL */ @@ -8283,85 +12523,74 @@ SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[48 * 2]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r10, %[a]\n\t" - "mov r11, %[b]\n\t" - "mov r6, #192\n\t" - "add r6, r6, r10\n\t" - "mov r14, r6\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r5, #0\n\t" - "mov r6, #188\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov %[b], r9\n\t" - "sub %[b], %[b], %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add %[b], %[b], r11\n\t" - "\n2:\n\t" - /* Multiply Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [%[b]]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply Done */ - "add %[a], %[a], #4\n\t" - "sub %[b], %[b], #4\n\t" - "cmp %[a], r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, r9\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r12\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #120\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r], r8]\n\t" - "mov %[a], r10\n\t" - "mov %[b], r11\n\t" - : - : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; - XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); + __asm__ __volatile__ ( + "SUB sp, sp, #0x180\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "\n" + "L_sp_3072_mul_48_outer_%=:\n\t" + "SUBS r3, r5, #0xbc\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_3072_mul_48_inner_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0xc0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_3072_mul_48_inner_done_%=\n\t" +#else + "BEQ.N L_sp_3072_mul_48_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_3072_mul_48_inner_%=\n\t" +#else + "BLE.N L_sp_3072_mul_48_inner_%=\n\t" +#endif + "\n" + "L_sp_3072_mul_48_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x178\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_3072_mul_48_outer_%=\n\t" +#else + "BLE.N L_sp_3072_mul_48_outer_%=\n\t" +#endif + 
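The sp_3072_add_48 and sp_3072_sub_in_place_48 hunks earlier in this section switch from single-word ldr/str to LDM/STM groups of four while keeping the same carry chain. A minimal C model of that word-wise add (illustrative names; the grouping by four is an unrolling detail the model ignores):

#include <stdint.h>

typedef uint32_t sp_digit;

/* Word-wise add with carry, as sp_3072_add_48 computes for n = 48. */
static sp_digit sp_add_words_model(sp_digit* r, const sp_digit* a,
                                   const sp_digit* b, int n)
{
    sp_digit c = 0;
    int i;

    for (i = 0; i < n; i++) {
        uint64_t t = (uint64_t)a[i] + b[i] + c;   /* ADDS/ADCS chain       */
        r[i] = (sp_digit)t;                       /* STM of the sum words  */
        c = (sp_digit)(t >> 32);                  /* carry parked in r3    */
    }
    return c;                                     /* "MOV %[r], r3" result */
}

sp_3072_sub_in_place_48 is the mirrored SBCS chain; its "SBC r10, r10, r10" leaves the borrow as 0 or an all-ones mask, which is what the function returns.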
"STR r6, [sp, r5]\n\t" + "\n" + "L_sp_3072_mul_48_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_3072_mul_48_store_%=\n\t" +#else + "BGT.N L_sp_3072_mul_48_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" + ); } /* Square a and put result in r. (r = a * a) @@ -8369,130 +12598,97 @@ SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) +static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #128\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #188\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" + "SUB sp, sp, #0x180\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_3072_sqr_48_outer_%=:\n\t" + "SUBS r3, r5, #0xbc\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_3072_sqr_48_inner_%=:\n\t" + "CMP r4, r3\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 4f\n\t" + "BEQ L_sp_3072_sqr_48_op_sqr_%=\n\t" #else - "beq.n 4f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ + "BEQ.N L_sp_3072_sqr_48_op_sqr_%=\n\t" +#endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[a], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "bal L_sp_3072_sqr_48_op_done_%=\n\t" + "\n" + "L_sp_3072_sqr_48_op_sqr_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "\n" + "L_sp_3072_sqr_48_op_done_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bal 5f\n\t" + "BEQ L_sp_3072_sqr_48_inner_done_%=\n\t" #else - "bal.n 5f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #192\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" + "BEQ.N L_sp_3072_sqr_48_inner_done_%=\n\t" +#endif + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" + "BGT L_sp_3072_sqr_48_inner_done_%=\n\t" #else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "cmp %[a], r2\n\t" + "BGT.N L_sp_3072_sqr_48_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bgt 3f\n\t" + "BLE L_sp_3072_sqr_48_inner_%=\n\t" #else - "bgt.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" + "BLE.N L_sp_3072_sqr_48_inner_%=\n\t" +#endif + "\n" + "L_sp_3072_sqr_48_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x178\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" + "BLE L_sp_3072_sqr_48_outer_%=\n\t" #else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #120\n\t" - "cmp r8, r6\n\t" + "BLE.N L_sp_3072_sqr_48_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_3072_sqr_48_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" + "BGT L_sp_3072_sqr_48_store_%=\n\t" #else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #1\n\t" - "lsl r3, r3, #8\n\t" - "add r3, r3, #124\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #128\n\t" - "add sp, sp, r6\n\t" + "BGT.N L_sp_3072_sqr_48_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -8519,48 +12715,554 @@ static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho) *rho = (sp_digit)0 - x; } +#ifdef WOLFSSL_SP_SMALL /* Mul a by digit b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision digit. 
*/ -SP_NOINLINE static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, - sp_digit b) +static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + __asm__ __volatile__ ( - "add r9, %[a], #384\n\t" /* A[0] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r5, r3, r6, %[b]\n\t" - "mov r4, #0\n\t" - "str r5, [%[r]], #4\n\t" - /* A[0] * B - Done */ - "\n1:\n\t" - "mov r5, #0\n\t" - /* A[] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r6, r8, r6, %[b]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[] * B - Done */ - "str r3, [%[r]], #4\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "cmp %[a], r9\n\t" + "LDR r8, [%[a]]\n\t" + "UMULL r5, r3, %[b], r8\n\t" + "MOV r4, #0x0\n\t" + "STR r5, [%[r]]\n\t" + "MOV r5, #0x0\n\t" + "MOV r9, #0x4\n\t" + "\n" + "L_sp_3072_mul_d_96_word_%=:\n\t" + /* A[i] * B */ + "LDR r8, [%[a], r9]\n\t" + "UMULL r6, r7, %[b], r8\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], r9]\n\t" + "MOV r3, r4\n\t" + "MOV r4, r5\n\t" + "MOV r5, #0x0\n\t" + "ADD r9, r9, #0x4\n\t" + "CMP r9, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_3072_mul_d_96_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r]]\n\t" - : [r] "+r" (r), [a] "+r" (a) - : [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9" + "BLT.N L_sp_3072_mul_d_96_word_%=\n\t" +#endif + "STR r3, [%[r], #384]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } +#else +/* Mul a by digit b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision digit. 
+ */ +static void sp_3072_mul_d_96(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + + __asm__ __volatile__ ( + /* A[0] * B */ + "LDM %[a]!, {r8}\n\t" + "UMULL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[1] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[2] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[3] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[4] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[5] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[6] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[7] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[8] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[9] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[10] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[11] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[12] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[13] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[14] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[15] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[16] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[17] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[18] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[19] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[20] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[21] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[22] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[23] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[24] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[25] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[26] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[27] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" 
+ "MOV r5, #0x0\n\t" + /* A[28] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[29] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[30] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[31] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[32] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[33] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[34] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[35] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[36] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[37] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[38] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[39] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[40] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[41] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[42] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[43] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[44] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[45] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[46] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[47] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[48] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[49] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[50] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[51] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[52] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[53] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[54] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[55] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[56] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[57] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, 
#0x0\n\t" + /* A[58] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[59] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[60] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[61] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[62] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[63] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[64] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[65] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[66] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[67] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[68] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[69] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[70] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[71] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[72] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[73] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[74] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[75] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[76] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[77] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[78] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[79] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[80] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[81] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[82] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[83] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[84] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[85] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[86] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[87] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* 
A[88] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[89] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[90] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[91] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[92] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[93] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[94] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[95] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "STR r3, [%[r]]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. @@ -8576,6 +13278,7 @@ static void sp_3072_mont_norm_48(sp_digit* r, const sp_digit* m) sp_3072_sub_in_place_48(r, m); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -8584,141 +13287,953 @@ static void sp_3072_mont_norm_48(sp_digit* r, const sp_digit* m) * b A single precision number to subtract. * m Mask value to apply. */ -SP_NOINLINE static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, - const sp_digit* b, sp_digit m) +static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #192\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r5, [%[a], r8]\n\t" - "sbcs r5, r5, r6\n\t" - "sbcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_3072_cond_sub_48_words_%=:\n\t" + "SUBS r4, r8, r4\n\t" + "LDR r6, [%[a], r5]\n\t" + "LDR r7, [%[b], r5]\n\t" + "AND r7, r7, %[m]\n\t" + "SBCS r6, r6, r7\n\t" + "SBC r4, r8, r8\n\t" + "STR r6, [%[r], r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_3072_cond_sub_48_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_3072_cond_sub_48_words_%=\n\t" +#endif + "MOV %[r], r4\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. 
+ * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. + */ +static sp_digit sp_3072_cond_sub_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r5, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SUBS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, 
r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "SBC %[r], r5, r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_NO_UMAAL /* Reduce the number back to 3072 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. 
*/ -SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - sp_digit ca = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( - "mov r9, %[mp]\n\t" - "mov r12, %[m]\n\t" - "mov r10, %[a]\n\t" - "mov r4, #0\n\t" - "add r11, r10, #192\n\t" - "\n1:\n\t" + "LDR lr, [%[m]]\n\t" + /* i = 0 */ + "MOV r11, #0x0\n\t" + "MOV r3, #0x0\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[a], #4]\n\t" + "\n" + "L_sp_3072_mont_reduce_48_word_%=:\n\t" /* mu = a[i] * mp */ - "mov %[mp], r9\n\t" - "ldr %[a], [r10]\n\t" - "mul %[mp], %[mp], %[a]\n\t" - "mov %[m], r12\n\t" - "add r14, r10, #184\n\t" - "\n2:\n\t" - /* a[i+j] += m[j] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+j+1] += m[j+1] * mu */ - "ldr %[a], [r10]\n\t" - "mov r4, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r4, r4, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r5, r5, %[a]\n\t" - "adc r4, r4, #0\n\t" - "str r5, [r10], #4\n\t" - "cmp r10, r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 2b\n\t" -#else - "blt.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ + "MUL r10, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "MOV r7, #0x0\n\t" + "UMLAL r4, r7, r10, lr\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r9, [%[m], #4]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r5, r6, r10, r9\n\t" + "MOV r4, r5\n\t" + "ADDS r4, r4, r7\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r9, [%[m], #8]\n\t" + "LDR r5, [%[a], #8]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r5, r7, r10, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r9, [%[m], #12]\n\t" + "LDR r12, [%[a], #12]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #12]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r9, [%[m], #16]\n\t" + "LDR r12, [%[a], #16]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #16]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r9, [%[m], #20]\n\t" + "LDR r12, [%[a], #20]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #20]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r9, [%[m], #24]\n\t" + "LDR r12, [%[a], #24]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #24]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r9, [%[m], #28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #28]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r9, [%[m], #32]\n\t" + "LDR r12, [%[a], #32]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #32]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r9, [%[m], #36]\n\t" + "LDR r12, 
[%[a], #36]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #36]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r9, [%[m], #40]\n\t" + "LDR r12, [%[a], #40]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #40]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r9, [%[m], #44]\n\t" + "LDR r12, [%[a], #44]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #44]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r9, [%[m], #48]\n\t" + "LDR r12, [%[a], #48]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #48]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r9, [%[m], #52]\n\t" + "LDR r12, [%[a], #52]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #52]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r9, [%[m], #56]\n\t" + "LDR r12, [%[a], #56]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #56]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r9, [%[m], #60]\n\t" + "LDR r12, [%[a], #60]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #60]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r9, [%[m], #64]\n\t" + "LDR r12, [%[a], #64]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #64]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r9, [%[m], #68]\n\t" + "LDR r12, [%[a], #68]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #68]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r9, [%[m], #72]\n\t" + "LDR r12, [%[a], #72]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #72]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r9, [%[m], #76]\n\t" + "LDR r12, [%[a], #76]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #76]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r9, [%[m], #80]\n\t" + "LDR r12, [%[a], #80]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #80]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r9, [%[m], #84]\n\t" + "LDR r12, [%[a], #84]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #84]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r9, [%[m], #88]\n\t" + "LDR r12, [%[a], #88]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #88]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r9, [%[m], #92]\n\t" + "LDR r12, [%[a], #92]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #92]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r9, [%[m], #96]\n\t" + "LDR r12, [%[a], #96]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #96]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r9, [%[m], #100]\n\t" + "LDR r12, [%[a], #100]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #100]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r9, [%[m], #104]\n\t" + "LDR r12, [%[a], #104]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #104]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r9, [%[m], #108]\n\t" + "LDR r12, [%[a], #108]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #108]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r9, [%[m], #112]\n\t" + "LDR r12, [%[a], #112]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #112]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r9, [%[m], #116]\n\t" + "LDR r12, [%[a], #116]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #116]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+30] += m[30] * mu */ + "LDR r9, [%[m], #120]\n\t" + "LDR r12, [%[a], #120]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #120]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+31] += m[31] * mu */ + "LDR r9, [%[m], #124]\n\t" + "LDR r12, [%[a], #124]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #124]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+32] += m[32] * mu */ + "LDR r9, [%[m], #128]\n\t" + "LDR r12, [%[a], #128]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #128]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+33] += m[33] * mu */ + "LDR r9, [%[m], #132]\n\t" + "LDR r12, [%[a], #132]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #132]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+34] += m[34] * mu */ + "LDR r9, [%[m], #136]\n\t" + "LDR r12, [%[a], #136]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #136]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+35] += m[35] * mu */ + "LDR r9, [%[m], #140]\n\t" + "LDR r12, [%[a], #140]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #140]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+36] += m[36] * mu */ + "LDR r9, [%[m], #144]\n\t" + "LDR r12, [%[a], #144]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #144]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+37] += m[37] * mu */ + "LDR r9, [%[m], #148]\n\t" + "LDR r12, [%[a], #148]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #148]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+38] += m[38] * mu */ + "LDR r9, [%[m], #152]\n\t" + "LDR r12, [%[a], #152]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #152]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+39] += m[39] * mu */ + "LDR r9, [%[m], #156]\n\t" + "LDR r12, [%[a], #156]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #156]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+40] += m[40] * mu */ + "LDR r9, [%[m], #160]\n\t" + "LDR r12, [%[a], #160]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #160]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+41] += m[41] * mu */ + "LDR r9, [%[m], #164]\n\t" + "LDR r12, [%[a], #164]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #164]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+42] += m[42] * mu */ + "LDR r9, [%[m], #168]\n\t" + "LDR r12, [%[a], #168]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #168]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+43] += m[43] * mu */ + "LDR r9, [%[m], #172]\n\t" + "LDR r12, [%[a], #172]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #172]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+44] += m[44] * mu */ + "LDR r9, [%[m], #176]\n\t" + "LDR r12, [%[a], #176]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #176]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+45] += m[45] * mu */ + "LDR r9, [%[m], #180]\n\t" + "LDR r12, [%[a], #180]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #180]\n\t" + "ADC r6, r6, #0x0\n\t" /* a[i+46] += m[46] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" + "LDR r9, [%[m], #184]\n\t" + "LDR r12, [%[a], #184]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #184]\n\t" + "ADC r7, r7, #0x0\n\t" /* a[i+47] += m[47] * mu */ - "mov r4, %[ca]\n\t" - "mov %[ca], #0\n\t" - /* Multiply m[47] and mu - Start */ - "ldr r8, [%[m]]\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc %[ca], %[ca], #0\n\t" - /* Multiply m[47] and mu - Done */ - "ldr r6, [r10]\n\t" - "ldr r8, [r10, #4]\n\t" - "adds r6, r6, r5\n\t" - "adcs r8, r8, r4\n\t" - "adc %[ca], %[ca], #0\n\t" - "str r6, [r10]\n\t" - "str r8, [r10, #4]\n\t" - /* Next word in a */ - "sub r10, r10, #184\n\t" - "cmp r10, r11\n\t" + "LDR r9, [%[m], #188]\n\t" + "LDR r12, [%[a], #188]\n\t" + "UMULL r8, r9, r10, r9\n\t" + "ADDS r7, r7, r8\n\t" + "ADCS r6, r9, r3\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, r3\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #188]\n\t" + "LDR r12, [%[a], #192]\n\t" + "ADCS r12, r12, r6\n\t" + "STR r12, [%[a], #192]\n\t" + "ADC r3, r3, #0x0\n\t" + /* i += 1 */ + "ADD r11, r11, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r11, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_3072_mont_reduce_48_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "mov %[m], r12\n\t" - : [ca] "+r" (ca), [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" + "BLT.N L_sp_3072_mont_reduce_48_word_%=\n\t" +#endif + /* Loop Done */ + "STR r4, [%[a]]\n\t" + "STR r5, [%[a], #4]\n\t" + "MOV %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - - sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - ca); + sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp); } +#else +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static void sp_3072_mont_reduce_48(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[a], #12]\n\t" + "LDR r10, [%[a], #16]\n\t" + "\n" + "L_sp_3072_mont_reduce_48_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL lr, %[mp], r6\n\t" + /* a[i+0] += m[0] * mu */ + "LDR r12, [%[m]]\n\t" + "MOV r3, #0x0\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r12, [%[m], #4]\n\t" + "MOV r6, r7\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r12, [%[m], #8]\n\t" + "MOV r7, r8\n\t" + "UMAAL r7, r3, lr, r12\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r12, [%[m], #12]\n\t" + "MOV r8, r9\n\t" + "UMAAL r8, r3, lr, r12\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r12, [%[m], #16]\n\t" + "MOV r9, r10\n\t" + "UMAAL r9, r3, lr, r12\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r12, [%[m], #20]\n\t" + "LDR r10, [%[a], #20]\n\t" + "UMAAL r10, r3, lr, r12\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r12, [%[m], #24]\n\t" + "LDR r11, [%[a], #24]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r12, [%[m], #28]\n\t" + "LDR r11, [%[a], #28]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r12, [%[m], #32]\n\t" + "LDR r11, [%[a], #32]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r12, [%[m], #36]\n\t" + "LDR r11, [%[a], #36]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r12, [%[m], #40]\n\t" + "LDR r11, [%[a], #40]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r12, [%[m], #44]\n\t" + "LDR r11, [%[a], #44]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r12, [%[m], #48]\n\t" + "LDR r11, [%[a], #48]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r12, [%[m], #52]\n\t" + "LDR r11, [%[a], #52]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r12, [%[m], #56]\n\t" + "LDR r11, [%[a], #56]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r12, [%[m], #60]\n\t" + "LDR r11, [%[a], #60]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r12, [%[m], #64]\n\t" + "LDR r11, [%[a], #64]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r12, [%[m], #68]\n\t" + "LDR r11, [%[a], #68]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r12, [%[m], #72]\n\t" + "LDR r11, [%[a], #72]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r12, [%[m], #76]\n\t" + "LDR r11, [%[a], #76]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r12, [%[m], #80]\n\t" + "LDR r11, [%[a], #80]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r12, [%[m], #84]\n\t" + "LDR r11, [%[a], #84]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + 
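This second sp_3072_mont_reduce_48 body (the #else branch of WOLFSSL_SP_NO_UMAAL) relies on UMAAL, which folds the running carry into the multiply step so each a[i+j] update needs no separate ADDS/ADC pair. UMAAL's documented behaviour as a one-line C model (the function name is illustrative):

#include <stdint.h>

/* UMAAL RdLo, RdHi, Rn, Rm:  RdHi:RdLo = Rn * Rm + RdLo + RdHi */
static inline void umaal_model(uint32_t* rdlo, uint32_t* rdhi,
                               uint32_t rn, uint32_t rm)
{
    uint64_t t = (uint64_t)rn * rm + *rdlo + *rdhi;   /* cannot overflow 64 bits */

    *rdlo = (uint32_t)t;           /* e.g. the a[i+j] word being updated */
    *rdhi = (uint32_t)(t >> 32);   /* e.g. the carry register r3         */
}

In the loop above, "UMAAL r11, r3, lr, r12" therefore computes a[i+j] + carry + mu*m[j] in one instruction, writing the low word back to a[i+j] and the new carry into r3.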
"STR r11, [%[a], #84]\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r12, [%[m], #88]\n\t" + "LDR r11, [%[a], #88]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r12, [%[m], #92]\n\t" + "LDR r11, [%[a], #92]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r12, [%[m], #96]\n\t" + "LDR r11, [%[a], #96]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r12, [%[m], #100]\n\t" + "LDR r11, [%[a], #100]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r12, [%[m], #104]\n\t" + "LDR r11, [%[a], #104]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r12, [%[m], #108]\n\t" + "LDR r11, [%[a], #108]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r12, [%[m], #112]\n\t" + "LDR r11, [%[a], #112]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r12, [%[m], #116]\n\t" + "LDR r11, [%[a], #116]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "LDR r12, [%[m], #120]\n\t" + "LDR r11, [%[a], #120]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "LDR r12, [%[m], #124]\n\t" + "LDR r11, [%[a], #124]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #124]\n\t" + /* a[i+32] += m[32] * mu */ + "LDR r12, [%[m], #128]\n\t" + "LDR r11, [%[a], #128]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #128]\n\t" + /* a[i+33] += m[33] * mu */ + "LDR r12, [%[m], #132]\n\t" + "LDR r11, [%[a], #132]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #132]\n\t" + /* a[i+34] += m[34] * mu */ + "LDR r12, [%[m], #136]\n\t" + "LDR r11, [%[a], #136]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #136]\n\t" + /* a[i+35] += m[35] * mu */ + "LDR r12, [%[m], #140]\n\t" + "LDR r11, [%[a], #140]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #140]\n\t" + /* a[i+36] += m[36] * mu */ + "LDR r12, [%[m], #144]\n\t" + "LDR r11, [%[a], #144]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #144]\n\t" + /* a[i+37] += m[37] * mu */ + "LDR r12, [%[m], #148]\n\t" + "LDR r11, [%[a], #148]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #148]\n\t" + /* a[i+38] += m[38] * mu */ + "LDR r12, [%[m], #152]\n\t" + "LDR r11, [%[a], #152]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #152]\n\t" + /* a[i+39] += m[39] * mu */ + "LDR r12, [%[m], #156]\n\t" + "LDR r11, [%[a], #156]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #156]\n\t" + /* a[i+40] += m[40] * mu */ + "LDR r12, [%[m], #160]\n\t" + "LDR r11, [%[a], #160]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #160]\n\t" + /* a[i+41] += m[41] * mu */ + "LDR r12, [%[m], #164]\n\t" + "LDR r11, [%[a], #164]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #164]\n\t" + /* a[i+42] += m[42] * mu */ + "LDR r12, [%[m], #168]\n\t" + "LDR r11, [%[a], #168]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #168]\n\t" + /* a[i+43] += m[43] * mu */ + "LDR r12, [%[m], #172]\n\t" + "LDR r11, [%[a], #172]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #172]\n\t" + /* a[i+44] += m[44] * mu */ + "LDR r12, [%[m], #176]\n\t" + "LDR r11, [%[a], #176]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #176]\n\t" + /* a[i+45] += m[45] * mu */ + "LDR r12, [%[m], #180]\n\t" + 
"LDR r11, [%[a], #180]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #180]\n\t" + /* a[i+46] += m[46] * mu */ + "LDR r12, [%[m], #184]\n\t" + "LDR r11, [%[a], #184]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #184]\n\t" + /* a[i+47] += m[47] * mu */ + "LDR r12, [%[m], #188]\n\t" + "LDR r11, [%[a], #188]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "LDR lr, [%[a], #192]\n\t" + "MOV r12, #0x0\n\t" + "UMAAL r3, lr, r12, r12\n\t" + "STR r11, [%[a], #188]\n\t" + "ADDS r3, r3, r5\n\t" + "ADC r5, lr, #0x0\n\t" + "STR r3, [%[a], #192]\n\t" + /* i += 1 */ + "ADD r4, r4, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r4, #0xc0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_3072_mont_reduce_48_word_%=\n\t" +#else + "BLT.N L_sp_3072_mont_reduce_48_word_%=\n\t" +#endif + /* Loop Done */ + "STR r6, [%[a]]\n\t" + "STR r7, [%[a], #4]\n\t" + "STR r8, [%[a], #8]\n\t" + "STR r9, [%[a], #12]\n\t" + "STR r10, [%[a], #16]\n\t" + "MOV %[mp], r5\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -8749,48 +14264,315 @@ SP_NOINLINE static void sp_3072_mont_sqr_48(sp_digit* r, const sp_digit* a, sp_3072_mont_reduce_48(r, m, mp); } +#ifdef WOLFSSL_SP_SMALL /* Mul a by digit b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision digit. */ -SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, - sp_digit b) +static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + __asm__ __volatile__ ( - "add r9, %[a], #192\n\t" /* A[0] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r5, r3, r6, %[b]\n\t" - "mov r4, #0\n\t" - "str r5, [%[r]], #4\n\t" - /* A[0] * B - Done */ - "\n1:\n\t" - "mov r5, #0\n\t" - /* A[] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r6, r8, r6, %[b]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[] * B - Done */ - "str r3, [%[r]], #4\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "cmp %[a], r9\n\t" + "LDR r8, [%[a]]\n\t" + "UMULL r5, r3, %[b], r8\n\t" + "MOV r4, #0x0\n\t" + "STR r5, [%[r]]\n\t" + "MOV r5, #0x0\n\t" + "MOV r9, #0x4\n\t" + "\n" + "L_sp_3072_mul_d_48_word_%=:\n\t" + /* A[i] * B */ + "LDR r8, [%[a], r9]\n\t" + "UMULL r6, r7, %[b], r8\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], r9]\n\t" + "MOV r3, r4\n\t" + "MOV r4, r5\n\t" + "MOV r5, #0x0\n\t" + "ADD r9, r9, #0x4\n\t" + "CMP r9, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_3072_mul_d_48_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r]]\n\t" - : [r] "+r" (r), [a] "+r" (a) - : [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9" + "BLT.N L_sp_3072_mul_d_48_word_%=\n\t" +#endif + "STR r3, [%[r], #192]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } +#else +/* Mul a by digit b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision digit. 
+ */ +static void sp_3072_mul_d_48(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + + __asm__ __volatile__ ( + /* A[0] * B */ + "LDM %[a]!, {r8}\n\t" + "UMULL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[1] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[2] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[3] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[4] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[5] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[6] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[7] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[8] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[9] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[10] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[11] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[12] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[13] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[14] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[15] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[16] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[17] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[18] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[19] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[20] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[21] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[22] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[23] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[24] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[25] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[26] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[27] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" 
+ "MOV r5, #0x0\n\t" + /* A[28] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[29] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[30] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[31] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[32] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[33] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[34] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[35] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[36] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[37] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[38] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[39] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[40] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[41] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[42] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[43] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[44] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[45] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[46] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[47] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "STR r3, [%[r]]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_USE_UDIV /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -8800,49 +14582,122 @@ SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, * * Note that this is an approximate div. It may give an answer 1 larger. 
*/ -SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, - sp_digit div) +static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - sp_digit r = 0; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( - "lsr r6, %[div], #16\n\t" - "add r6, r6, #1\n\t" - "udiv r4, %[d1], r6\n\t" - "lsl r8, r4, #16\n\t" - "umull r4, r5, %[div], r8\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r5, %[d1], r6\n\t" - "lsl r4, r5, #16\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r4, %[d0], %[div]\n\t" - "add r8, r8, r4\n\t" - "mov %[r], r8\n\t" - : [r] "+r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "r4", "r5", "r6", "r8" + "LSR r8, %[div], #16\n\t" + "ADD r5, r8, #0x1\n\t" + "UDIV r6, %[d1], r5\n\t" + "LSL r7, %[div], #16\n\t" + "LSL r6, r6, #16\n\t" + "UMULL r3, r4, %[div], r6\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "SUBS r3, %[d1], r5\n\t" + "SBC r9, r9, r9\n\t" + "ADD r9, r9, #0x1\n\t" + "RSB r10, r9, #0x0\n\t" + "LSL r9, r9, #16\n\t" + "AND r7, r7, r10\n\t" + "AND r8, r8, r10\n\t" + "SUBS %[d0], %[d0], r7\n\t" + "ADD r6, r6, r9\n\t" + "SBC %[d1], %[d1], r8\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "UMULL r3, r4, %[div], r3\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "MUL r3, %[div], r3\n\t" + "SUB %[d0], %[d0], r3\n\t" + "UDIV r3, %[d0], %[div]\n\t" + "ADD %[d1], r6, r3\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - return r; + return (uint32_t)(size_t)d1; } +#else +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. + * div The divisor. + * returns the result of the division. + * + * Note that this is an approximate div. It may give an answer 1 larger. 
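Illustrative aside (not part of the patch): the two div_3072_word_48 variants estimate a 64-by-32-bit quotient without a 64-bit hardware divide; the WOLFSSL_SP_USE_UDIV path works in 16-bit chunks while the fallback shifts in one dividend bit per iteration. As the comments note, both may overshoot by one, which the callers correct. The exact value being approximated is simply the following (a reference sketch, assuming d1 < div so the quotient fits in 32 bits):

    #include <stdint.h>

    /* Exact quotient the assembly variants approximate: (d1:d0) / div. */
    static uint32_t div_word_ref(uint32_t d1, uint32_t d0, uint32_t div)
    {
        uint64_t d = ((uint64_t)d1 << 32) | d0;
        return (uint32_t)(d / div);
    }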
+ */ +static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +{ + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; + + __asm__ __volatile__ ( + "LSR r5, %[div], #1\n\t" + "ADD r5, r5, #0x1\n\t" + "MOV r6, %[d0]\n\t" + "MOV r7, %[d1]\n\t" + /* Do top 32 */ + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "MOV r3, #0x0\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + /* Next 30 bits */ + "MOV r4, #0x1d\n\t" + "\n" + "L_div_3072_word_48_bit_%=:\n\t" + "LSLS r6, r6, #1\n\t" + "ADC r7, r7, r7\n\t" + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "ADD r3, r3, r3\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + "SUBS r4, r4, #0x1\n\t" + "bpl L_div_3072_word_48_bit_%=\n\t" + "ADD r3, r3, r3\n\t" + "ADD r3, r3, #0x1\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "SUBS r8, %[div], r9\n\t" + "SBC r8, r8, r8\n\t" + "SUB %[d1], r3, r8\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)d1; +} + +#endif /* Compare a with b in constant time. * * a A single precision integer. @@ -8850,44 +14705,571 @@ SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. */ -SP_NOINLINE static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_3072_cmp_48(const sp_digit* a_p, const sp_digit* b_p) { - sp_digit r = 0; - + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mvn r3, r3\n\t" - "mov r6, #188\n\t" - "\n1:\n\t" - "ldr r8, [%[a], r6]\n\t" - "ldr r5, [%[b], r6]\n\t" - "and r8, r8, r3\n\t" - "and r5, r5, r3\n\t" - "mov r4, r8\n\t" - "subs r8, r8, r5\n\t" - "sbc r8, r8, r8\n\t" - "add %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "subs r5, r5, r4\n\t" - "sbc r8, r8, r8\n\t" - "sub %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "sub r6, r6, #4\n\t" - "cmp r6, #0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 1b\n\t" + "MOV r2, #0x-1\n\t" + "MOV r8, #0x1\n\t" + "MOV r7, #0x0\n\t" + "MOV r3, #0x-1\n\t" +#ifdef WOLFSSL_SP_SMALL + "MOV r6, #0xbc\n\t" + "\n" + "L_sp_3072_cmp_48_words_%=:\n\t" + "LDR r4, [%[a], r6]\n\t" + "LDR r5, [%[b], r6]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "SUBS r6, r6, #0x4\n\t" + "bcs L_sp_3072_cmp_48_words_%=\n\t" + "EOR r2, r2, r3\n\t" #else - "bge.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "r3", "r4", "r5", "r6", "r8" + "LDR r4, [%[a], #188]\n\t" + "LDR r5, [%[b], #188]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #184]\n\t" + "LDR 
r5, [%[b], #184]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #180]\n\t" + "LDR r5, [%[b], #180]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #176]\n\t" + "LDR r5, [%[b], #176]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #172]\n\t" + "LDR r5, [%[b], #172]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #168]\n\t" + "LDR r5, [%[b], #168]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #164]\n\t" + "LDR r5, [%[b], #164]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #160]\n\t" + "LDR r5, [%[b], #160]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #156]\n\t" + "LDR r5, [%[b], #156]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #152]\n\t" + "LDR r5, [%[b], #152]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #148]\n\t" + "LDR r5, [%[b], #148]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #144]\n\t" + "LDR r5, [%[b], #144]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #140]\n\t" + "LDR r5, [%[b], #140]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #136]\n\t" + "LDR r5, [%[b], #136]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #132]\n\t" + "LDR r5, [%[b], #132]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #128]\n\t" + "LDR r5, [%[b], #128]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #124]\n\t" + "LDR r5, [%[b], #124]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, 
r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #120]\n\t" + "LDR r5, [%[b], #120]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #116]\n\t" + "LDR r5, [%[b], #116]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #112]\n\t" + "LDR r5, [%[b], #112]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #108]\n\t" + "LDR r5, [%[b], #108]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #104]\n\t" + "LDR r5, [%[b], #104]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #100]\n\t" + "LDR r5, [%[b], #100]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #96]\n\t" + "LDR r5, [%[b], #96]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #92]\n\t" + "LDR r5, [%[b], #92]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #88]\n\t" + "LDR r5, [%[b], #88]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #84]\n\t" + "LDR r5, [%[b], #84]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #80]\n\t" + "LDR r5, [%[b], #80]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #76]\n\t" + "LDR r5, [%[b], #76]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #72]\n\t" + "LDR r5, [%[b], #72]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #68]\n\t" + "LDR r5, [%[b], #68]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #64]\n\t" + "LDR r5, [%[b], #64]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT 
lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #60]\n\t" + "LDR r5, [%[b], #60]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #56]\n\t" + "LDR r5, [%[b], #56]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #52]\n\t" + "LDR r5, [%[b], #52]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #48]\n\t" + "LDR r5, [%[b], #48]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #44]\n\t" + "LDR r5, [%[b], #44]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #40]\n\t" + "LDR r5, [%[b], #40]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #36]\n\t" + "LDR r5, [%[b], #36]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #32]\n\t" + "LDR r5, [%[b], #32]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #28]\n\t" + "LDR r5, [%[b], #28]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #24]\n\t" + "LDR r5, [%[b], #24]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #20]\n\t" + "LDR r5, [%[b], #20]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #16]\n\t" + "LDR r5, [%[b], #16]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #12]\n\t" + "LDR r5, [%[b], #12]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #8]\n\t" + "LDR r5, [%[b], #8]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #4]\n\t" + "LDR r5, [%[b], #4]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a]]\n\t" + "LDR 
r5, [%[b]]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "EOR r2, r2, r3\n\t" +#endif /*WOLFSSL_SP_SMALL */ + "MOV %[a], r2\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); - - return r; + return (uint32_t)(size_t)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -9286,6 +15668,7 @@ static void sp_3072_mont_norm_96(sp_digit* r, const sp_digit* m) } #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ +#ifdef WOLFSSL_SP_SMALL /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -9294,143 +15677,1745 @@ static void sp_3072_mont_norm_96(sp_digit* r, const sp_digit* m) * b A single precision number to subtract. * m Mask value to apply. */ -SP_NOINLINE static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a, - const sp_digit* b, sp_digit m) +static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #1\n\t" - "lsl r5, r5, #8\n\t" - "add r5, r5, #128\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r5, [%[a], r8]\n\t" - "sbcs r5, r5, r6\n\t" - "sbcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_3072_cond_sub_96_words_%=:\n\t" + "SUBS r4, r8, r4\n\t" + "LDR r6, [%[a], r5]\n\t" + "LDR r7, [%[b], r5]\n\t" + "AND r7, r7, %[m]\n\t" + "SBCS r6, r6, r7\n\t" + "SBC r4, r8, r8\n\t" + "STR r6, [%[r], r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_3072_cond_sub_96_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_3072_cond_sub_96_words_%=\n\t" +#endif + "MOV %[r], r4\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
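Illustrative aside (not part of the patch): sp_3072_cmp_48 walks the words from most significant to least and uses IT/conditional moves so the instruction sequence does not depend on the data. The C sketch below, with hypothetical names, shows what is computed rather than the branch-free encoding (a C compiler may well reintroduce branches here):

    #include <stdint.h>

    /* Returns <0, 0 or >0 as a is less than, equal to or greater than b.
     * Only the most significant differing word decides the result; once a
     * difference is seen, 'mask' zeroes every remaining comparison. */
    static int32_t cmp_ref(const uint32_t* a, const uint32_t* b, int n)
    {
        int32_t r = 0;
        uint32_t mask = 0xffffffffU;
        for (int i = n - 1; i >= 0; i--) {
            uint32_t x = a[i] & mask;
            uint32_t y = b[i] & mask;
            r += (x > y) - (x < y);                    /* +/-1 at the first difference */
            mask &= (uint32_t)0 - (uint32_t)(x == y);  /* becomes 0 once words differ */
        }
        return r;
    }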
+ */ +static sp_digit sp_3072_cond_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r5, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SUBS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, 
r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM 
%[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "SBC %[r], r5, r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_NO_UMAAL /* Reduce the number back to 3072 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. 
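Illustrative aside (not part of the patch): sp_3072_cond_sub_96 applies the mask m (0 or all-ones) to every word of b before a full-width subtract, so the subtraction either happens completely or not at all without branching on the condition. A minimal C sketch with hypothetical names; the return value mirrors the assembly's borrow mask:

    #include <stdint.h>

    /* r = a - (b & m) over n words; m is 0 (keep a) or 0xffffffff (subtract b).
     * Returns 0 when there is no borrow out and 0xffffffff when there is. */
    static uint32_t cond_sub_ref(uint32_t* r, const uint32_t* a, const uint32_t* b,
                                 uint32_t m, int n)
    {
        uint32_t borrow = 0;
        for (int i = 0; i < n; i++) {
            uint64_t t = (uint64_t)a[i] - (b[i] & m) - borrow;
            r[i] = (uint32_t)t;
            borrow = (uint32_t)(t >> 32) & 1U;  /* 1 when the word subtraction wrapped */
        }
        return (uint32_t)0 - borrow;
    }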
*/ -SP_NOINLINE static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - sp_digit ca = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( - "mov r9, %[mp]\n\t" - "mov r12, %[m]\n\t" - "mov r10, %[a]\n\t" - "mov r4, #0\n\t" - "add r11, r10, #384\n\t" - "\n1:\n\t" + "LDR lr, [%[m]]\n\t" + /* i = 0 */ + "MOV r11, #0x0\n\t" + "MOV r3, #0x0\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[a], #4]\n\t" + "\n" + "L_sp_3072_mont_reduce_96_word_%=:\n\t" /* mu = a[i] * mp */ - "mov %[mp], r9\n\t" - "ldr %[a], [r10]\n\t" - "mul %[mp], %[mp], %[a]\n\t" - "mov %[m], r12\n\t" - "add r14, r10, #376\n\t" - "\n2:\n\t" - /* a[i+j] += m[j] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+j+1] += m[j+1] * mu */ - "ldr %[a], [r10]\n\t" - "mov r4, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r4, r4, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r5, r5, %[a]\n\t" - "adc r4, r4, #0\n\t" - "str r5, [r10], #4\n\t" - "cmp r10, r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 2b\n\t" -#else - "blt.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ + "MUL r10, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "MOV r7, #0x0\n\t" + "UMLAL r4, r7, r10, lr\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r9, [%[m], #4]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r5, r6, r10, r9\n\t" + "MOV r4, r5\n\t" + "ADDS r4, r4, r7\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r9, [%[m], #8]\n\t" + "LDR r5, [%[a], #8]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r5, r7, r10, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r9, [%[m], #12]\n\t" + "LDR r12, [%[a], #12]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #12]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r9, [%[m], #16]\n\t" + "LDR r12, [%[a], #16]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #16]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r9, [%[m], #20]\n\t" + "LDR r12, [%[a], #20]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #20]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r9, [%[m], #24]\n\t" + "LDR r12, [%[a], #24]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #24]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r9, [%[m], #28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #28]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r9, [%[m], #32]\n\t" + "LDR r12, [%[a], #32]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #32]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r9, [%[m], #36]\n\t" + "LDR r12, 
[%[a], #36]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #36]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r9, [%[m], #40]\n\t" + "LDR r12, [%[a], #40]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #40]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r9, [%[m], #44]\n\t" + "LDR r12, [%[a], #44]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #44]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r9, [%[m], #48]\n\t" + "LDR r12, [%[a], #48]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #48]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r9, [%[m], #52]\n\t" + "LDR r12, [%[a], #52]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #52]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r9, [%[m], #56]\n\t" + "LDR r12, [%[a], #56]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #56]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r9, [%[m], #60]\n\t" + "LDR r12, [%[a], #60]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #60]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r9, [%[m], #64]\n\t" + "LDR r12, [%[a], #64]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #64]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r9, [%[m], #68]\n\t" + "LDR r12, [%[a], #68]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #68]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r9, [%[m], #72]\n\t" + "LDR r12, [%[a], #72]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #72]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r9, [%[m], #76]\n\t" + "LDR r12, [%[a], #76]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #76]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r9, [%[m], #80]\n\t" + "LDR r12, [%[a], #80]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #80]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r9, [%[m], #84]\n\t" + "LDR r12, [%[a], #84]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #84]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r9, [%[m], #88]\n\t" + "LDR r12, [%[a], #88]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #88]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r9, [%[m], #92]\n\t" + "LDR r12, [%[a], #92]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #92]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r9, [%[m], #96]\n\t" + "LDR r12, [%[a], #96]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #96]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r9, [%[m], #100]\n\t" + "LDR r12, [%[a], #100]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #100]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r9, [%[m], #104]\n\t" + "LDR r12, [%[a], #104]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #104]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r9, [%[m], #108]\n\t" + "LDR r12, [%[a], #108]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #108]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r9, [%[m], #112]\n\t" + "LDR r12, [%[a], #112]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #112]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r9, [%[m], #116]\n\t" + "LDR r12, [%[a], #116]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #116]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+30] += m[30] * mu */ + "LDR r9, [%[m], #120]\n\t" + "LDR r12, [%[a], #120]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #120]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+31] += m[31] * mu */ + "LDR r9, [%[m], #124]\n\t" + "LDR r12, [%[a], #124]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #124]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+32] += m[32] * mu */ + "LDR r9, [%[m], #128]\n\t" + "LDR r12, [%[a], #128]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #128]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+33] += m[33] * mu */ + "LDR r9, [%[m], #132]\n\t" + "LDR r12, [%[a], #132]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #132]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+34] += m[34] * mu */ + "LDR r9, [%[m], #136]\n\t" + "LDR r12, [%[a], #136]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #136]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+35] += m[35] * mu */ + "LDR r9, [%[m], #140]\n\t" + "LDR r12, [%[a], #140]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #140]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+36] += m[36] * mu */ + "LDR r9, [%[m], #144]\n\t" + "LDR r12, [%[a], #144]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #144]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+37] += m[37] * mu */ + "LDR r9, [%[m], #148]\n\t" + "LDR r12, [%[a], #148]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #148]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+38] += m[38] * mu */ + "LDR r9, [%[m], #152]\n\t" + "LDR r12, [%[a], #152]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #152]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+39] += m[39] * mu */ + "LDR r9, [%[m], #156]\n\t" + "LDR r12, [%[a], #156]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #156]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+40] += m[40] * mu */ + "LDR r9, [%[m], #160]\n\t" + "LDR r12, [%[a], #160]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #160]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+41] += m[41] * mu */ + "LDR r9, [%[m], #164]\n\t" + "LDR r12, [%[a], #164]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #164]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+42] += m[42] * mu */ + "LDR r9, [%[m], #168]\n\t" + "LDR r12, [%[a], #168]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #168]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+43] += m[43] * mu */ + "LDR r9, [%[m], #172]\n\t" + "LDR r12, [%[a], #172]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #172]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+44] += m[44] * mu */ + "LDR r9, [%[m], #176]\n\t" + "LDR r12, [%[a], #176]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #176]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+45] += m[45] * mu */ + "LDR r9, [%[m], #180]\n\t" + "LDR r12, [%[a], #180]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #180]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+46] += m[46] * mu */ + "LDR r9, [%[m], #184]\n\t" + "LDR r12, [%[a], #184]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #184]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+47] += m[47] * mu */ + "LDR r9, [%[m], #188]\n\t" + "LDR r12, [%[a], #188]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #188]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+48] += m[48] * mu */ + "LDR r9, [%[m], #192]\n\t" + "LDR r12, [%[a], #192]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #192]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+49] += m[49] * mu */ + "LDR r9, [%[m], #196]\n\t" + "LDR r12, [%[a], #196]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #196]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+50] += m[50] * mu */ + "LDR r9, [%[m], #200]\n\t" + "LDR r12, [%[a], #200]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #200]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+51] += m[51] * mu */ + "LDR r9, [%[m], #204]\n\t" + "LDR r12, [%[a], #204]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #204]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+52] += m[52] * mu */ + "LDR r9, [%[m], #208]\n\t" + "LDR r12, [%[a], #208]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #208]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+53] += m[53] * mu */ + "LDR r9, [%[m], #212]\n\t" + "LDR r12, [%[a], #212]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #212]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+54] += m[54] * mu */ + "LDR r9, [%[m], #216]\n\t" + "LDR r12, [%[a], #216]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #216]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+55] += m[55] * mu */ + "LDR r9, [%[m], #220]\n\t" + "LDR r12, [%[a], #220]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #220]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+56] += m[56] * mu */ + "LDR r9, [%[m], #224]\n\t" + "LDR r12, [%[a], #224]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #224]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+57] += m[57] * mu */ + "LDR r9, [%[m], #228]\n\t" + "LDR r12, [%[a], #228]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #228]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+58] += m[58] * mu */ + "LDR r9, [%[m], #232]\n\t" + "LDR r12, [%[a], #232]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #232]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+59] += m[59] * mu */ + "LDR r9, [%[m], #236]\n\t" + "LDR r12, [%[a], #236]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #236]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+60] += m[60] * mu */ + "LDR r9, [%[m], #240]\n\t" + "LDR r12, [%[a], #240]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #240]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+61] += m[61] * mu */ + "LDR r9, [%[m], #244]\n\t" + "LDR r12, [%[a], #244]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #244]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+62] += m[62] * mu */ + "LDR r9, [%[m], #248]\n\t" + "LDR r12, [%[a], #248]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #248]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+63] += m[63] * mu */ + "LDR r9, [%[m], #252]\n\t" + "LDR r12, [%[a], #252]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #252]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+64] += m[64] * mu */ + "LDR r9, [%[m], #256]\n\t" + "LDR r12, [%[a], #256]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #256]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+65] += m[65] * mu */ + "LDR r9, [%[m], #260]\n\t" + "LDR r12, [%[a], #260]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #260]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+66] += m[66] * mu */ + "LDR r9, [%[m], #264]\n\t" + "LDR r12, [%[a], #264]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #264]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+67] += m[67] * mu */ + "LDR r9, [%[m], #268]\n\t" + "LDR r12, [%[a], #268]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #268]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+68] += m[68] * mu */ + "LDR r9, [%[m], #272]\n\t" + "LDR r12, [%[a], #272]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #272]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+69] += m[69] * mu */ + "LDR r9, [%[m], #276]\n\t" + "LDR r12, [%[a], #276]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #276]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+70] += m[70] * mu */ + "LDR r9, [%[m], #280]\n\t" + "LDR r12, [%[a], #280]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #280]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+71] += m[71] * mu */ + "LDR r9, [%[m], #284]\n\t" + "LDR r12, [%[a], #284]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #284]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+72] += m[72] * mu */ + "LDR r9, [%[m], #288]\n\t" + "LDR r12, [%[a], #288]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #288]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+73] += m[73] * mu */ + "LDR r9, [%[m], #292]\n\t" + "LDR r12, [%[a], #292]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #292]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+74] += m[74] * mu */ + "LDR r9, [%[m], #296]\n\t" + "LDR r12, [%[a], #296]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #296]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+75] += m[75] * mu */ + "LDR r9, [%[m], #300]\n\t" + "LDR r12, [%[a], #300]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #300]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+76] += m[76] * mu */ + "LDR r9, [%[m], #304]\n\t" + "LDR r12, [%[a], #304]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #304]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+77] += m[77] * mu */ + "LDR r9, [%[m], #308]\n\t" + "LDR r12, [%[a], #308]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #308]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+78] += m[78] * mu */ + "LDR r9, [%[m], #312]\n\t" + "LDR r12, [%[a], #312]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #312]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+79] += m[79] * mu */ + "LDR r9, [%[m], #316]\n\t" + "LDR r12, [%[a], #316]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #316]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+80] += m[80] * mu */ + "LDR r9, [%[m], #320]\n\t" + "LDR r12, [%[a], #320]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #320]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+81] += m[81] * mu */ + "LDR r9, [%[m], #324]\n\t" + "LDR r12, [%[a], #324]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #324]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+82] += m[82] * mu */ + "LDR r9, [%[m], #328]\n\t" + "LDR r12, [%[a], #328]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #328]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+83] += m[83] * mu */ + "LDR r9, [%[m], #332]\n\t" + "LDR r12, [%[a], #332]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #332]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+84] += m[84] * mu */ + "LDR r9, [%[m], #336]\n\t" + "LDR r12, [%[a], #336]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #336]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+85] += m[85] * mu */ + "LDR r9, [%[m], #340]\n\t" + "LDR r12, [%[a], #340]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #340]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+86] += m[86] * mu */ + "LDR r9, [%[m], #344]\n\t" + "LDR r12, [%[a], #344]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #344]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+87] += m[87] * mu */ + "LDR r9, [%[m], #348]\n\t" + "LDR r12, [%[a], #348]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #348]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+88] += m[88] * mu */ + "LDR r9, [%[m], #352]\n\t" + "LDR r12, [%[a], #352]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #352]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+89] += m[89] * mu */ + "LDR r9, [%[m], #356]\n\t" + "LDR r12, [%[a], #356]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #356]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+90] += m[90] * mu */ + "LDR r9, [%[m], #360]\n\t" + "LDR r12, [%[a], #360]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #360]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+91] += m[91] * mu */ + "LDR r9, [%[m], #364]\n\t" + "LDR r12, [%[a], #364]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #364]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+92] += m[92] * mu */ + "LDR r9, [%[m], #368]\n\t" + "LDR r12, [%[a], #368]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #368]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+93] += m[93] * mu */ + "LDR r9, [%[m], #372]\n\t" + "LDR r12, [%[a], #372]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #372]\n\t" + "ADC r6, r6, #0x0\n\t" /* a[i+94] += m[94] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" + "LDR r9, [%[m], #376]\n\t" + "LDR r12, [%[a], #376]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #376]\n\t" + "ADC r7, r7, #0x0\n\t" /* a[i+95] += m[95] * mu */ - "mov r4, %[ca]\n\t" - "mov %[ca], #0\n\t" - /* Multiply m[95] and mu - Start */ - "ldr r8, [%[m]]\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc %[ca], %[ca], #0\n\t" - /* Multiply m[95] and mu - Done */ - "ldr r6, [r10]\n\t" - "ldr r8, [r10, #4]\n\t" - "adds r6, r6, r5\n\t" - "adcs r8, r8, r4\n\t" - "adc %[ca], %[ca], #0\n\t" - "str r6, [r10]\n\t" - "str r8, [r10, #4]\n\t" - /* Next word in a */ - "sub r10, r10, #376\n\t" - "cmp r10, r11\n\t" + "LDR r9, [%[m], #380]\n\t" + "LDR r12, [%[a], #380]\n\t" + "UMULL r8, r9, r10, r9\n\t" + "ADDS r7, r7, r8\n\t" + "ADCS r6, r9, r3\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, r3\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #380]\n\t" + "LDR r12, [%[a], #384]\n\t" + "ADCS r12, r12, r6\n\t" + "STR r12, [%[a], #384]\n\t" + "ADC r3, r3, #0x0\n\t" + /* i += 1 */ + "ADD r11, r11, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r11, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_3072_mont_reduce_96_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "mov %[m], r12\n\t" - : [ca] "+r" (ca), [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" + "BLT.N L_sp_3072_mont_reduce_96_word_%=\n\t" +#endif + /* Loop Done */ + "STR r4, [%[a]]\n\t" + "STR r5, [%[a], #4]\n\t" + "MOV %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - - sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - ca); + sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp); } +#else +/* Reduce the number back to 3072 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
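Aside, not part of the patch: the unrolled UMLAL/UMAAL sequences above and below implement word-wise Montgomery reduction. A minimal C sketch of the loop being unrolled, assuming 32-bit limbs and a 96-word modulus; the helper name and layout are illustrative and are not wolfSSL's actual fallback code.

#include <stdint.h>

/* Schematic Montgomery reduction: a holds at least 192 words, m holds 96 words,
 * mp = -1/m mod 2^32. The reduced value ends up in a[96..191]; the caller then
 * performs a conditional subtraction, as sp_3072_cond_sub_96() does in the patch. */
static void mont_reduce_96_sketch(uint32_t* a, const uint32_t* m, uint32_t mp)
{
    uint32_t ca = 0;                            /* carry saved across outer iterations */
    for (int i = 0; i < 96; i++) {
        uint32_t mu = a[i] * mp;                /* makes a[i] + mu*m[0] == 0 mod 2^32 */
        uint64_t c = 0;
        for (int j = 0; j < 96; j++) {          /* a[i..i+95] += mu * m[0..95] */
            uint64_t t = (uint64_t)mu * m[j] + a[i + j] + c;
            a[i + j] = (uint32_t)t;
            c = t >> 32;
        }
        uint64_t t = (uint64_t)a[i + 96] + c + ca;
        a[i + 96] = (uint32_t)t;
        ca = (uint32_t)(t >> 32);
    }
}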
+ */ +static void sp_3072_mont_reduce_96(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[a], #12]\n\t" + "LDR r10, [%[a], #16]\n\t" + "\n" + "L_sp_3072_mont_reduce_96_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL lr, %[mp], r6\n\t" + /* a[i+0] += m[0] * mu */ + "LDR r12, [%[m]]\n\t" + "MOV r3, #0x0\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r12, [%[m], #4]\n\t" + "MOV r6, r7\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r12, [%[m], #8]\n\t" + "MOV r7, r8\n\t" + "UMAAL r7, r3, lr, r12\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r12, [%[m], #12]\n\t" + "MOV r8, r9\n\t" + "UMAAL r8, r3, lr, r12\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r12, [%[m], #16]\n\t" + "MOV r9, r10\n\t" + "UMAAL r9, r3, lr, r12\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r12, [%[m], #20]\n\t" + "LDR r10, [%[a], #20]\n\t" + "UMAAL r10, r3, lr, r12\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r12, [%[m], #24]\n\t" + "LDR r11, [%[a], #24]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r12, [%[m], #28]\n\t" + "LDR r11, [%[a], #28]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r12, [%[m], #32]\n\t" + "LDR r11, [%[a], #32]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r12, [%[m], #36]\n\t" + "LDR r11, [%[a], #36]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r12, [%[m], #40]\n\t" + "LDR r11, [%[a], #40]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r12, [%[m], #44]\n\t" + "LDR r11, [%[a], #44]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r12, [%[m], #48]\n\t" + "LDR r11, [%[a], #48]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r12, [%[m], #52]\n\t" + "LDR r11, [%[a], #52]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r12, [%[m], #56]\n\t" + "LDR r11, [%[a], #56]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r12, [%[m], #60]\n\t" + "LDR r11, [%[a], #60]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r12, [%[m], #64]\n\t" + "LDR r11, [%[a], #64]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r12, [%[m], #68]\n\t" + "LDR r11, [%[a], #68]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r12, [%[m], #72]\n\t" + "LDR r11, [%[a], #72]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r12, [%[m], #76]\n\t" + "LDR r11, [%[a], #76]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r12, [%[m], #80]\n\t" + "LDR r11, [%[a], #80]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r12, [%[m], #84]\n\t" + "LDR r11, [%[a], #84]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + 
"STR r11, [%[a], #84]\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r12, [%[m], #88]\n\t" + "LDR r11, [%[a], #88]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r12, [%[m], #92]\n\t" + "LDR r11, [%[a], #92]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r12, [%[m], #96]\n\t" + "LDR r11, [%[a], #96]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r12, [%[m], #100]\n\t" + "LDR r11, [%[a], #100]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r12, [%[m], #104]\n\t" + "LDR r11, [%[a], #104]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r12, [%[m], #108]\n\t" + "LDR r11, [%[a], #108]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r12, [%[m], #112]\n\t" + "LDR r11, [%[a], #112]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r12, [%[m], #116]\n\t" + "LDR r11, [%[a], #116]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "LDR r12, [%[m], #120]\n\t" + "LDR r11, [%[a], #120]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "LDR r12, [%[m], #124]\n\t" + "LDR r11, [%[a], #124]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #124]\n\t" + /* a[i+32] += m[32] * mu */ + "LDR r12, [%[m], #128]\n\t" + "LDR r11, [%[a], #128]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #128]\n\t" + /* a[i+33] += m[33] * mu */ + "LDR r12, [%[m], #132]\n\t" + "LDR r11, [%[a], #132]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #132]\n\t" + /* a[i+34] += m[34] * mu */ + "LDR r12, [%[m], #136]\n\t" + "LDR r11, [%[a], #136]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #136]\n\t" + /* a[i+35] += m[35] * mu */ + "LDR r12, [%[m], #140]\n\t" + "LDR r11, [%[a], #140]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #140]\n\t" + /* a[i+36] += m[36] * mu */ + "LDR r12, [%[m], #144]\n\t" + "LDR r11, [%[a], #144]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #144]\n\t" + /* a[i+37] += m[37] * mu */ + "LDR r12, [%[m], #148]\n\t" + "LDR r11, [%[a], #148]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #148]\n\t" + /* a[i+38] += m[38] * mu */ + "LDR r12, [%[m], #152]\n\t" + "LDR r11, [%[a], #152]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #152]\n\t" + /* a[i+39] += m[39] * mu */ + "LDR r12, [%[m], #156]\n\t" + "LDR r11, [%[a], #156]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #156]\n\t" + /* a[i+40] += m[40] * mu */ + "LDR r12, [%[m], #160]\n\t" + "LDR r11, [%[a], #160]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #160]\n\t" + /* a[i+41] += m[41] * mu */ + "LDR r12, [%[m], #164]\n\t" + "LDR r11, [%[a], #164]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #164]\n\t" + /* a[i+42] += m[42] * mu */ + "LDR r12, [%[m], #168]\n\t" + "LDR r11, [%[a], #168]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #168]\n\t" + /* a[i+43] += m[43] * mu */ + "LDR r12, [%[m], #172]\n\t" + "LDR r11, [%[a], #172]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #172]\n\t" + /* a[i+44] += m[44] * mu */ + "LDR r12, [%[m], #176]\n\t" + "LDR r11, [%[a], #176]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #176]\n\t" + /* a[i+45] += m[45] * mu */ + "LDR r12, [%[m], #180]\n\t" + 
"LDR r11, [%[a], #180]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #180]\n\t" + /* a[i+46] += m[46] * mu */ + "LDR r12, [%[m], #184]\n\t" + "LDR r11, [%[a], #184]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #184]\n\t" + /* a[i+47] += m[47] * mu */ + "LDR r12, [%[m], #188]\n\t" + "LDR r11, [%[a], #188]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #188]\n\t" + /* a[i+48] += m[48] * mu */ + "LDR r12, [%[m], #192]\n\t" + "LDR r11, [%[a], #192]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #192]\n\t" + /* a[i+49] += m[49] * mu */ + "LDR r12, [%[m], #196]\n\t" + "LDR r11, [%[a], #196]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #196]\n\t" + /* a[i+50] += m[50] * mu */ + "LDR r12, [%[m], #200]\n\t" + "LDR r11, [%[a], #200]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #200]\n\t" + /* a[i+51] += m[51] * mu */ + "LDR r12, [%[m], #204]\n\t" + "LDR r11, [%[a], #204]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #204]\n\t" + /* a[i+52] += m[52] * mu */ + "LDR r12, [%[m], #208]\n\t" + "LDR r11, [%[a], #208]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #208]\n\t" + /* a[i+53] += m[53] * mu */ + "LDR r12, [%[m], #212]\n\t" + "LDR r11, [%[a], #212]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #212]\n\t" + /* a[i+54] += m[54] * mu */ + "LDR r12, [%[m], #216]\n\t" + "LDR r11, [%[a], #216]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #216]\n\t" + /* a[i+55] += m[55] * mu */ + "LDR r12, [%[m], #220]\n\t" + "LDR r11, [%[a], #220]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #220]\n\t" + /* a[i+56] += m[56] * mu */ + "LDR r12, [%[m], #224]\n\t" + "LDR r11, [%[a], #224]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #224]\n\t" + /* a[i+57] += m[57] * mu */ + "LDR r12, [%[m], #228]\n\t" + "LDR r11, [%[a], #228]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #228]\n\t" + /* a[i+58] += m[58] * mu */ + "LDR r12, [%[m], #232]\n\t" + "LDR r11, [%[a], #232]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #232]\n\t" + /* a[i+59] += m[59] * mu */ + "LDR r12, [%[m], #236]\n\t" + "LDR r11, [%[a], #236]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #236]\n\t" + /* a[i+60] += m[60] * mu */ + "LDR r12, [%[m], #240]\n\t" + "LDR r11, [%[a], #240]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #240]\n\t" + /* a[i+61] += m[61] * mu */ + "LDR r12, [%[m], #244]\n\t" + "LDR r11, [%[a], #244]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #244]\n\t" + /* a[i+62] += m[62] * mu */ + "LDR r12, [%[m], #248]\n\t" + "LDR r11, [%[a], #248]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #248]\n\t" + /* a[i+63] += m[63] * mu */ + "LDR r12, [%[m], #252]\n\t" + "LDR r11, [%[a], #252]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #252]\n\t" + /* a[i+64] += m[64] * mu */ + "LDR r12, [%[m], #256]\n\t" + "LDR r11, [%[a], #256]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #256]\n\t" + /* a[i+65] += m[65] * mu */ + "LDR r12, [%[m], #260]\n\t" + "LDR r11, [%[a], #260]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #260]\n\t" + /* a[i+66] += m[66] * mu */ + "LDR r12, [%[m], #264]\n\t" + "LDR r11, [%[a], #264]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #264]\n\t" + /* a[i+67] += m[67] * mu */ + "LDR r12, [%[m], #268]\n\t" + "LDR r11, [%[a], #268]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #268]\n\t" + /* a[i+68] += m[68] * mu */ + "LDR r12, [%[m], #272]\n\t" + "LDR r11, [%[a], #272]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], 
#272]\n\t" + /* a[i+69] += m[69] * mu */ + "LDR r12, [%[m], #276]\n\t" + "LDR r11, [%[a], #276]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #276]\n\t" + /* a[i+70] += m[70] * mu */ + "LDR r12, [%[m], #280]\n\t" + "LDR r11, [%[a], #280]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #280]\n\t" + /* a[i+71] += m[71] * mu */ + "LDR r12, [%[m], #284]\n\t" + "LDR r11, [%[a], #284]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #284]\n\t" + /* a[i+72] += m[72] * mu */ + "LDR r12, [%[m], #288]\n\t" + "LDR r11, [%[a], #288]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #288]\n\t" + /* a[i+73] += m[73] * mu */ + "LDR r12, [%[m], #292]\n\t" + "LDR r11, [%[a], #292]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #292]\n\t" + /* a[i+74] += m[74] * mu */ + "LDR r12, [%[m], #296]\n\t" + "LDR r11, [%[a], #296]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #296]\n\t" + /* a[i+75] += m[75] * mu */ + "LDR r12, [%[m], #300]\n\t" + "LDR r11, [%[a], #300]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #300]\n\t" + /* a[i+76] += m[76] * mu */ + "LDR r12, [%[m], #304]\n\t" + "LDR r11, [%[a], #304]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #304]\n\t" + /* a[i+77] += m[77] * mu */ + "LDR r12, [%[m], #308]\n\t" + "LDR r11, [%[a], #308]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #308]\n\t" + /* a[i+78] += m[78] * mu */ + "LDR r12, [%[m], #312]\n\t" + "LDR r11, [%[a], #312]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #312]\n\t" + /* a[i+79] += m[79] * mu */ + "LDR r12, [%[m], #316]\n\t" + "LDR r11, [%[a], #316]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #316]\n\t" + /* a[i+80] += m[80] * mu */ + "LDR r12, [%[m], #320]\n\t" + "LDR r11, [%[a], #320]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #320]\n\t" + /* a[i+81] += m[81] * mu */ + "LDR r12, [%[m], #324]\n\t" + "LDR r11, [%[a], #324]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #324]\n\t" + /* a[i+82] += m[82] * mu */ + "LDR r12, [%[m], #328]\n\t" + "LDR r11, [%[a], #328]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #328]\n\t" + /* a[i+83] += m[83] * mu */ + "LDR r12, [%[m], #332]\n\t" + "LDR r11, [%[a], #332]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #332]\n\t" + /* a[i+84] += m[84] * mu */ + "LDR r12, [%[m], #336]\n\t" + "LDR r11, [%[a], #336]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #336]\n\t" + /* a[i+85] += m[85] * mu */ + "LDR r12, [%[m], #340]\n\t" + "LDR r11, [%[a], #340]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #340]\n\t" + /* a[i+86] += m[86] * mu */ + "LDR r12, [%[m], #344]\n\t" + "LDR r11, [%[a], #344]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #344]\n\t" + /* a[i+87] += m[87] * mu */ + "LDR r12, [%[m], #348]\n\t" + "LDR r11, [%[a], #348]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #348]\n\t" + /* a[i+88] += m[88] * mu */ + "LDR r12, [%[m], #352]\n\t" + "LDR r11, [%[a], #352]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #352]\n\t" + /* a[i+89] += m[89] * mu */ + "LDR r12, [%[m], #356]\n\t" + "LDR r11, [%[a], #356]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #356]\n\t" + /* a[i+90] += m[90] * mu */ + "LDR r12, [%[m], #360]\n\t" + "LDR r11, [%[a], #360]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #360]\n\t" + /* a[i+91] += m[91] * mu */ + "LDR r12, [%[m], #364]\n\t" + "LDR r11, [%[a], #364]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #364]\n\t" + /* a[i+92] += m[92] * mu */ + "LDR r12, [%[m], #368]\n\t" + "LDR 
r11, [%[a], #368]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #368]\n\t" + /* a[i+93] += m[93] * mu */ + "LDR r12, [%[m], #372]\n\t" + "LDR r11, [%[a], #372]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #372]\n\t" + /* a[i+94] += m[94] * mu */ + "LDR r12, [%[m], #376]\n\t" + "LDR r11, [%[a], #376]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #376]\n\t" + /* a[i+95] += m[95] * mu */ + "LDR r12, [%[m], #380]\n\t" + "LDR r11, [%[a], #380]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "LDR lr, [%[a], #384]\n\t" + "MOV r12, #0x0\n\t" + "UMAAL r3, lr, r12, r12\n\t" + "STR r11, [%[a], #380]\n\t" + "ADDS r3, r3, r5\n\t" + "ADC r5, lr, #0x0\n\t" + "STR r3, [%[a], #384]\n\t" + /* i += 1 */ + "ADD r4, r4, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r4, #0x180\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_3072_mont_reduce_96_word_%=\n\t" +#else + "BLT.N L_sp_3072_mont_reduce_96_word_%=\n\t" +#endif + /* Loop Done */ + "STR r6, [%[a]]\n\t" + "STR r7, [%[a], #4]\n\t" + "STR r8, [%[a], #8]\n\t" + "STR r9, [%[a], #12]\n\t" + "STR r10, [%[a], #16]\n\t" + "MOV %[mp], r5\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + sp_3072_cond_sub_96(a - 96, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -9468,40 +17453,38 @@ SP_NOINLINE static void sp_3072_mont_sqr_96(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r5, #1\n\t" - "lsl r5, r5, #8\n\t" - "add r5, r5, #128\n\t" - "add r6, r6, r5\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "sbcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r11, #0x0\n\t" + "ADD r12, %[a], #0x180\n\t" + "\n" + "L_sp_3072_sub_96_word_%=:\n\t" + "RSBS r11, r11, #0x0\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC r11, r3, r3\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_3072_sub_96_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_3072_sub_96_word_%=\n\t" +#endif + "MOV %[r], r11\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #else @@ -9511,262 +17494,191 @@ SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
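Aside, not part of the patch: both sp_3072_sub_96 variants are plain multi-precision subtraction with a propagated borrow, and the closing "SBC r, r, r" turns the final borrow into 0 or all-ones. A minimal C model of that borrow chain, assuming 32-bit limbs; the helper name is illustrative.

#include <stdint.h>

/* r = a - b over n 32-bit words; returns 0 if no borrow, 0xFFFFFFFF if a < b. */
static uint32_t sub_words_sketch(uint32_t* r, const uint32_t* a,
                                 const uint32_t* b, int n)
{
    uint64_t borrow = 0;
    for (int i = 0; i < n; i++) {
        uint64_t d = (uint64_t)a[i] - b[i] - borrow;
        r[i] = (uint32_t)d;
        borrow = (d >> 32) & 1;                 /* 1 when the word subtraction wrapped */
    }
    return (uint32_t)0 - (uint32_t)borrow;
}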
*/ -SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_3072_sub_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - 
"stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, 
r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, 
r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_USE_UDIV /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -9776,49 +17688,122 @@ SP_NOINLINE static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, * * Note that this is an approximate div. It may give an answer 1 larger. */ -SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, - sp_digit div) +static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - sp_digit r = 0; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( - "lsr r6, %[div], #16\n\t" - "add r6, r6, #1\n\t" - "udiv r4, %[d1], r6\n\t" - "lsl r8, r4, #16\n\t" - "umull r4, r5, %[div], r8\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r5, %[d1], r6\n\t" - "lsl r4, r5, #16\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r4, %[d0], %[div]\n\t" - "add r8, r8, r4\n\t" - "mov %[r], r8\n\t" - : [r] "+r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "r4", "r5", "r6", "r8" + "LSR r8, %[div], #16\n\t" + "ADD r5, r8, #0x1\n\t" + "UDIV r6, %[d1], r5\n\t" + "LSL r7, %[div], #16\n\t" + "LSL r6, r6, #16\n\t" + "UMULL r3, r4, %[div], r6\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "SUBS r3, %[d1], r5\n\t" + "SBC r9, r9, r9\n\t" + "ADD r9, r9, #0x1\n\t" + "RSB r10, r9, #0x0\n\t" + "LSL r9, r9, #16\n\t" + "AND r7, r7, r10\n\t" + "AND r8, r8, r10\n\t" + "SUBS %[d0], %[d0], r7\n\t" + "ADD r6, r6, r9\n\t" + "SBC %[d1], %[d1], r8\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, 
r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "UMULL r3, r4, %[div], r3\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "MUL r3, %[div], r3\n\t" + "SUB %[d0], %[d0], r3\n\t" + "UDIV r3, %[d0], %[div]\n\t" + "ADD %[d1], r6, r3\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - return r; + return (uint32_t)(size_t)d1; } +#else +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. + * div The divisor. + * returns the result of the division. + * + * Note that this is an approximate div. It may give an answer 1 larger. + */ +static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +{ + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; + + __asm__ __volatile__ ( + "LSR r5, %[div], #1\n\t" + "ADD r5, r5, #0x1\n\t" + "MOV r6, %[d0]\n\t" + "MOV r7, %[d1]\n\t" + /* Do top 32 */ + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "MOV r3, #0x0\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + /* Next 30 bits */ + "MOV r4, #0x1d\n\t" + "\n" + "L_div_3072_word_96_bit_%=:\n\t" + "LSLS r6, r6, #1\n\t" + "ADC r7, r7, r7\n\t" + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "ADD r3, r3, r3\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + "SUBS r4, r4, #0x1\n\t" + "bpl L_div_3072_word_96_bit_%=\n\t" + "ADD r3, r3, r3\n\t" + "ADD r3, r3, #0x1\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "SUBS r8, %[div], r9\n\t" + "SBC r8, r8, r8\n\t" + "SUB %[d1], r3, r8\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)d1; +} + +#endif /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -9891,6 +17876,7 @@ static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, const sp_digit* a, const s } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -9928,46 +17914,1099 @@ static void sp_3072_mask_96(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
*/ -SP_NOINLINE static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_3072_cmp_96(const sp_digit* a_p, const sp_digit* b_p) { - sp_digit r = 0; - + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mvn r3, r3\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #124\n\t" - "\n1:\n\t" - "ldr r8, [%[a], r6]\n\t" - "ldr r5, [%[b], r6]\n\t" - "and r8, r8, r3\n\t" - "and r5, r5, r3\n\t" - "mov r4, r8\n\t" - "subs r8, r8, r5\n\t" - "sbc r8, r8, r8\n\t" - "add %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "subs r5, r5, r4\n\t" - "sbc r8, r8, r8\n\t" - "sub %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "sub r6, r6, #4\n\t" - "cmp r6, #0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 1b\n\t" + "MOV r2, #0x-1\n\t" + "MOV r8, #0x1\n\t" + "MOV r7, #0x0\n\t" + "MOV r3, #0x-1\n\t" +#ifdef WOLFSSL_SP_SMALL + "MOV r6, #0x17c\n\t" + "\n" + "L_sp_3072_cmp_96_words_%=:\n\t" + "LDR r4, [%[a], r6]\n\t" + "LDR r5, [%[b], r6]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "SUBS r6, r6, #0x4\n\t" + "bcs L_sp_3072_cmp_96_words_%=\n\t" + "EOR r2, r2, r3\n\t" #else - "bge.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "r3", "r4", "r5", "r6", "r8" + "LDR r4, [%[a], #380]\n\t" + "LDR r5, [%[b], #380]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #376]\n\t" + "LDR r5, [%[b], #376]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #372]\n\t" + "LDR r5, [%[b], #372]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #368]\n\t" + "LDR r5, [%[b], #368]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #364]\n\t" + "LDR r5, [%[b], #364]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #360]\n\t" + "LDR r5, [%[b], #360]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #356]\n\t" + "LDR r5, [%[b], #356]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #352]\n\t" + "LDR r5, [%[b], #352]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #348]\n\t" + "LDR r5, [%[b], #348]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, 
r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #344]\n\t" + "LDR r5, [%[b], #344]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #340]\n\t" + "LDR r5, [%[b], #340]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #336]\n\t" + "LDR r5, [%[b], #336]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #332]\n\t" + "LDR r5, [%[b], #332]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #328]\n\t" + "LDR r5, [%[b], #328]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #324]\n\t" + "LDR r5, [%[b], #324]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #320]\n\t" + "LDR r5, [%[b], #320]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #316]\n\t" + "LDR r5, [%[b], #316]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #312]\n\t" + "LDR r5, [%[b], #312]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #308]\n\t" + "LDR r5, [%[b], #308]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #304]\n\t" + "LDR r5, [%[b], #304]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #300]\n\t" + "LDR r5, [%[b], #300]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #296]\n\t" + "LDR r5, [%[b], #296]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #292]\n\t" + "LDR r5, [%[b], #292]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #288]\n\t" + "LDR r5, [%[b], #288]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, 
r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #284]\n\t" + "LDR r5, [%[b], #284]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #280]\n\t" + "LDR r5, [%[b], #280]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #276]\n\t" + "LDR r5, [%[b], #276]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #272]\n\t" + "LDR r5, [%[b], #272]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #268]\n\t" + "LDR r5, [%[b], #268]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #264]\n\t" + "LDR r5, [%[b], #264]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #260]\n\t" + "LDR r5, [%[b], #260]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #256]\n\t" + "LDR r5, [%[b], #256]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #252]\n\t" + "LDR r5, [%[b], #252]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #248]\n\t" + "LDR r5, [%[b], #248]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #244]\n\t" + "LDR r5, [%[b], #244]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #240]\n\t" + "LDR r5, [%[b], #240]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #236]\n\t" + "LDR r5, [%[b], #236]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #232]\n\t" + "LDR r5, [%[b], #232]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #228]\n\t" + "LDR r5, [%[b], #228]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + 
"movne r3, r7\n\t" + "LDR r4, [%[a], #224]\n\t" + "LDR r5, [%[b], #224]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #220]\n\t" + "LDR r5, [%[b], #220]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #216]\n\t" + "LDR r5, [%[b], #216]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #212]\n\t" + "LDR r5, [%[b], #212]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #208]\n\t" + "LDR r5, [%[b], #208]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #204]\n\t" + "LDR r5, [%[b], #204]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #200]\n\t" + "LDR r5, [%[b], #200]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #196]\n\t" + "LDR r5, [%[b], #196]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #192]\n\t" + "LDR r5, [%[b], #192]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #188]\n\t" + "LDR r5, [%[b], #188]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #184]\n\t" + "LDR r5, [%[b], #184]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #180]\n\t" + "LDR r5, [%[b], #180]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #176]\n\t" + "LDR r5, [%[b], #176]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #172]\n\t" + "LDR r5, [%[b], #172]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #168]\n\t" + "LDR r5, [%[b], #168]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #164]\n\t" + "LDR r5, 
[%[b], #164]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #160]\n\t" + "LDR r5, [%[b], #160]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #156]\n\t" + "LDR r5, [%[b], #156]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #152]\n\t" + "LDR r5, [%[b], #152]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #148]\n\t" + "LDR r5, [%[b], #148]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #144]\n\t" + "LDR r5, [%[b], #144]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #140]\n\t" + "LDR r5, [%[b], #140]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #136]\n\t" + "LDR r5, [%[b], #136]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #132]\n\t" + "LDR r5, [%[b], #132]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #128]\n\t" + "LDR r5, [%[b], #128]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #124]\n\t" + "LDR r5, [%[b], #124]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #120]\n\t" + "LDR r5, [%[b], #120]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #116]\n\t" + "LDR r5, [%[b], #116]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #112]\n\t" + "LDR r5, [%[b], #112]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #108]\n\t" + "LDR r5, [%[b], #108]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #104]\n\t" + "LDR r5, [%[b], #104]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, 
r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #100]\n\t" + "LDR r5, [%[b], #100]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #96]\n\t" + "LDR r5, [%[b], #96]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #92]\n\t" + "LDR r5, [%[b], #92]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #88]\n\t" + "LDR r5, [%[b], #88]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #84]\n\t" + "LDR r5, [%[b], #84]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #80]\n\t" + "LDR r5, [%[b], #80]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #76]\n\t" + "LDR r5, [%[b], #76]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #72]\n\t" + "LDR r5, [%[b], #72]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #68]\n\t" + "LDR r5, [%[b], #68]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #64]\n\t" + "LDR r5, [%[b], #64]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #60]\n\t" + "LDR r5, [%[b], #60]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #56]\n\t" + "LDR r5, [%[b], #56]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #52]\n\t" + "LDR r5, [%[b], #52]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #48]\n\t" + "LDR r5, [%[b], #48]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #44]\n\t" + "LDR r5, [%[b], #44]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + 
"movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #40]\n\t" + "LDR r5, [%[b], #40]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #36]\n\t" + "LDR r5, [%[b], #36]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #32]\n\t" + "LDR r5, [%[b], #32]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #28]\n\t" + "LDR r5, [%[b], #28]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #24]\n\t" + "LDR r5, [%[b], #24]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #20]\n\t" + "LDR r5, [%[b], #20]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #16]\n\t" + "LDR r5, [%[b], #16]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #12]\n\t" + "LDR r5, [%[b], #12]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #8]\n\t" + "LDR r5, [%[b], #8]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #4]\n\t" + "LDR r5, [%[b], #4]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[b]]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "EOR r2, r2, r3\n\t" +#endif /*WOLFSSL_SP_SMALL */ + "MOV %[a], r2\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); - - return r; + return (uint32_t)(size_t)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -10025,6 +19064,7 @@ static WC_INLINE int sp_3072_mod_96(sp_digit* r, const sp_digit* a, const sp_dig return sp_3072_div_96(a, m, NULL, r); } +#endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -10488,6 +19528,7 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, } #ifndef WOLFSSL_RSA_PUBLIC_ONLY +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -10496,39 +19537,236 @@ int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em, * b A single precision number to add. 
* m Mask value to apply. */ -SP_NOINLINE static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) +static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #192\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adds r5, %[c], #-1\n\t" - "ldr r5, [%[a], r8]\n\t" - "adcs r5, r5, r6\n\t" - "mov %[c], #0\n\t" - "adcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r5, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "\n" + "L_sp_3072_cond_add_48_words_%=:\n\t" + "ADDS r5, r5, #0x-1\n\t" + "LDR r6, [%[a], r4]\n\t" + "LDR r7, [%[b], r4]\n\t" + "AND r7, r7, %[m]\n\t" + "ADCS r6, r6, r7\n\t" + "ADC r5, r8, r8\n\t" + "STR r6, [%[r], r4]\n\t" + "ADD r4, r4, #0x4\n\t" + "CMP r4, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_3072_cond_add_48_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_3072_cond_add_48_words_%=\n\t" +#endif + "MOV %[r], r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
+ */ +static sp_digit sp_3072_cond_add_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADDS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND 
r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "ADC %[r], r10, r10\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* RSA private key operation. * * in Array of bytes representing the number to exponentiate, base. @@ -10843,602 +20081,593 @@ int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef WOLFSSL_HAVE_SP_DH #ifdef HAVE_FFDHE_3072 -static void sp_3072_lshift_96(sp_digit* r, const sp_digit* a, byte n) +static void sp_3072_lshift_96(sp_digit* r_p, const sp_digit* a_p, byte n_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; + __asm__ __volatile__ ( - "mov r6, #31\n\t" - "sub r6, r6, %[n]\n\t" - "add %[a], %[a], #320\n\t" - "add %[r], %[r], #320\n\t" - "ldr r3, [%[a], #60]\n\t" - "lsr r4, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r4, r4, r6\n\t" - "ldr r2, [%[a], #56]\n\t" - "str r4, [%[r], #64]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #52]\n\t" - "str r3, [%[r], #60]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #48]\n\t" - "str r2, [%[r], #56]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #44]\n\t" - "str r4, [%[r], #52]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #40]\n\t" - "str r3, [%[r], #48]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #36]\n\t" - "str r2, [%[r], #44]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #32]\n\t" - "str r4, [%[r], #40]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #28]\n\t" - "str r3, [%[r], #36]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #24]\n\t" - "str r2, [%[r], #32]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, 
r4, r5\n\t" - "ldr r2, [%[a], #20]\n\t" - "str r4, [%[r], #28]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #16]\n\t" - "str r3, [%[r], #24]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #12]\n\t" - "str r2, [%[r], #20]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #8]\n\t" - "str r4, [%[r], #16]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #4]\n\t" - "str r3, [%[r], #12]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #0]\n\t" - "str r2, [%[r], #8]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r4, [%[r], #68]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r3, [%[r], #64]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r2, [%[r], #60]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r4, [%[r], #56]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r3, [%[r], #52]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r2, [%[r], #48]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r4, [%[r], #44]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r3, [%[r], #40]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r2, [%[r], #36]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r4, [%[r], #32]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r3, [%[r], #28]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r2, [%[r], #24]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r4, [%[r], #20]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r3, [%[r], #16]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #4]\n\t" - "str r2, [%[r], #12]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #0]\n\t" - "str r4, [%[r], #8]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r4, [%[a], #60]\n\t" - "str r3, [%[r], #68]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, 
r2, r5\n\t" - "ldr r3, [%[a], #56]\n\t" - "str r2, [%[r], #64]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #52]\n\t" - "str r4, [%[r], #60]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #48]\n\t" - "str r3, [%[r], #56]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #44]\n\t" - "str r2, [%[r], #52]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #40]\n\t" - "str r4, [%[r], #48]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #36]\n\t" - "str r3, [%[r], #44]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #32]\n\t" - "str r2, [%[r], #40]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #28]\n\t" - "str r4, [%[r], #36]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #24]\n\t" - "str r3, [%[r], #32]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #20]\n\t" - "str r2, [%[r], #28]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #16]\n\t" - "str r4, [%[r], #24]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #12]\n\t" - "str r3, [%[r], #20]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #8]\n\t" - "str r2, [%[r], #16]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #4]\n\t" - "str r4, [%[r], #12]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #0]\n\t" - "str r3, [%[r], #8]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r3, [%[a], #60]\n\t" - "str r2, [%[r], #68]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #56]\n\t" - "str r4, [%[r], #64]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #52]\n\t" - "str r3, [%[r], #60]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #48]\n\t" - "str r2, [%[r], #56]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #44]\n\t" - "str r4, [%[r], #52]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #40]\n\t" - "str r3, [%[r], #48]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #36]\n\t" - "str r2, [%[r], #44]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #32]\n\t" - "str r4, [%[r], #40]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #28]\n\t" - "str r3, 
[%[r], #36]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #24]\n\t" - "str r2, [%[r], #32]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #20]\n\t" - "str r4, [%[r], #28]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #16]\n\t" - "str r3, [%[r], #24]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #12]\n\t" - "str r2, [%[r], #20]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #8]\n\t" - "str r4, [%[r], #16]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #4]\n\t" - "str r3, [%[r], #12]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #0]\n\t" - "str r2, [%[r], #8]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r4, [%[r], #68]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r3, [%[r], #64]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r2, [%[r], #60]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r4, [%[r], #56]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r3, [%[r], #52]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r2, [%[r], #48]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r4, [%[r], #44]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r3, [%[r], #40]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r2, [%[r], #36]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r4, [%[r], #32]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r3, [%[r], #28]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r2, [%[r], #24]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r4, [%[r], #20]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r3, [%[r], #16]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #4]\n\t" - "str r2, [%[r], #12]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #0]\n\t" - "str r4, [%[r], #8]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, 
%[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r4, [%[a], #60]\n\t" - "str r3, [%[r], #68]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #56]\n\t" - "str r2, [%[r], #64]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #52]\n\t" - "str r4, [%[r], #60]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #48]\n\t" - "str r3, [%[r], #56]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #44]\n\t" - "str r2, [%[r], #52]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #40]\n\t" - "str r4, [%[r], #48]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #36]\n\t" - "str r3, [%[r], #44]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #32]\n\t" - "str r2, [%[r], #40]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #28]\n\t" - "str r4, [%[r], #36]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #24]\n\t" - "str r3, [%[r], #32]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #20]\n\t" - "str r2, [%[r], #28]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #16]\n\t" - "str r4, [%[r], #24]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #12]\n\t" - "str r3, [%[r], #20]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #8]\n\t" - "str r2, [%[r], #16]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #4]\n\t" - "str r4, [%[r], #12]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #0]\n\t" - "str r3, [%[r], #8]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "str r4, [%[r]]\n\t" - "str r2, [%[r], #4]\n\t" + "RSB r7, %[n], #0x1f\n\t" + "LDR r5, [%[a], #380]\n\t" + "LSR r6, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r6, r6, r7\n\t" + "LDR r4, [%[a], #376]\n\t" + "STR r6, [%[r], #384]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #372]\n\t" + "STR r5, [%[r], #380]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #368]\n\t" + "STR r4, [%[r], #376]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #364]\n\t" + "STR r6, [%[r], #372]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #360]\n\t" + "STR r5, [%[r], #368]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #356]\n\t" + "STR r4, [%[r], #364]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + 
"LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #352]\n\t" + "STR r6, [%[r], #360]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #348]\n\t" + "STR r5, [%[r], #356]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #344]\n\t" + "STR r4, [%[r], #352]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #340]\n\t" + "STR r6, [%[r], #348]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #336]\n\t" + "STR r5, [%[r], #344]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #332]\n\t" + "STR r4, [%[r], #340]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #328]\n\t" + "STR r6, [%[r], #336]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #324]\n\t" + "STR r5, [%[r], #332]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #320]\n\t" + "STR r4, [%[r], #328]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #316]\n\t" + "STR r6, [%[r], #324]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #312]\n\t" + "STR r5, [%[r], #320]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #308]\n\t" + "STR r4, [%[r], #316]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #304]\n\t" + "STR r6, [%[r], #312]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #300]\n\t" + "STR r5, [%[r], #308]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #296]\n\t" + "STR r4, [%[r], #304]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #292]\n\t" + "STR r6, [%[r], #300]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #288]\n\t" + "STR r5, [%[r], #296]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #284]\n\t" + "STR r4, [%[r], #292]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #280]\n\t" + "STR r6, [%[r], #288]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #276]\n\t" + "STR r5, [%[r], #284]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #272]\n\t" + "STR r4, [%[r], #280]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #268]\n\t" + "STR r6, [%[r], #276]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #264]\n\t" + "STR r5, [%[r], #272]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, 
[%[a], #260]\n\t" + "STR r4, [%[r], #268]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #256]\n\t" + "STR r6, [%[r], #264]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #252]\n\t" + "STR r5, [%[r], #260]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #248]\n\t" + "STR r4, [%[r], #256]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #244]\n\t" + "STR r6, [%[r], #252]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #240]\n\t" + "STR r5, [%[r], #248]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #236]\n\t" + "STR r4, [%[r], #244]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #232]\n\t" + "STR r6, [%[r], #240]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #228]\n\t" + "STR r5, [%[r], #236]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #224]\n\t" + "STR r4, [%[r], #232]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #220]\n\t" + "STR r6, [%[r], #228]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #216]\n\t" + "STR r5, [%[r], #224]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #212]\n\t" + "STR r4, [%[r], #220]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #208]\n\t" + "STR r6, [%[r], #216]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #204]\n\t" + "STR r5, [%[r], #212]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #200]\n\t" + "STR r4, [%[r], #208]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #196]\n\t" + "STR r6, [%[r], #204]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #192]\n\t" + "STR r5, [%[r], #200]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #188]\n\t" + "STR r4, [%[r], #196]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #184]\n\t" + "STR r6, [%[r], #192]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #180]\n\t" + "STR r5, [%[r], #188]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #176]\n\t" + "STR r4, [%[r], #184]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #172]\n\t" + "STR r6, [%[r], #180]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #168]\n\t" + "STR r5, [%[r], #176]\n\t" + "LSR r3, 
r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #164]\n\t" + "STR r4, [%[r], #172]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #160]\n\t" + "STR r6, [%[r], #168]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #156]\n\t" + "STR r5, [%[r], #164]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #152]\n\t" + "STR r4, [%[r], #160]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #148]\n\t" + "STR r6, [%[r], #156]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #144]\n\t" + "STR r5, [%[r], #152]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #140]\n\t" + "STR r4, [%[r], #148]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #136]\n\t" + "STR r6, [%[r], #144]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #132]\n\t" + "STR r5, [%[r], #140]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #128]\n\t" + "STR r4, [%[r], #136]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #124]\n\t" + "STR r6, [%[r], #132]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #120]\n\t" + "STR r5, [%[r], #128]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #116]\n\t" + "STR r4, [%[r], #124]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #112]\n\t" + "STR r6, [%[r], #120]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #108]\n\t" + "STR r5, [%[r], #116]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #104]\n\t" + "STR r4, [%[r], #112]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #100]\n\t" + "STR r6, [%[r], #108]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #96]\n\t" + "STR r5, [%[r], #104]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #92]\n\t" + "STR r4, [%[r], #100]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #88]\n\t" + "STR r6, [%[r], #96]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #84]\n\t" + "STR r5, [%[r], #92]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #80]\n\t" + "STR r4, [%[r], #88]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #76]\n\t" + "STR r6, [%[r], #84]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR 
r5, r5, r3\n\t" + "LDR r6, [%[a], #72]\n\t" + "STR r5, [%[r], #80]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #68]\n\t" + "STR r4, [%[r], #76]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #64]\n\t" + "STR r6, [%[r], #72]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #60]\n\t" + "STR r5, [%[r], #68]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #56]\n\t" + "STR r4, [%[r], #64]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #52]\n\t" + "STR r6, [%[r], #60]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #48]\n\t" + "STR r5, [%[r], #56]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #44]\n\t" + "STR r4, [%[r], #52]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #40]\n\t" + "STR r6, [%[r], #48]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #36]\n\t" + "STR r5, [%[r], #44]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #32]\n\t" + "STR r4, [%[r], #40]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #28]\n\t" + "STR r6, [%[r], #36]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #24]\n\t" + "STR r5, [%[r], #32]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #20]\n\t" + "STR r4, [%[r], #28]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #16]\n\t" + "STR r6, [%[r], #24]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #12]\n\t" + "STR r5, [%[r], #20]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #8]\n\t" + "STR r4, [%[r], #16]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #4]\n\t" + "STR r6, [%[r], #12]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a]]\n\t" + "STR r5, [%[r], #8]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "STR r6, [%[r]]\n\t" + "STR r4, [%[r], #4]\n\t" + : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n) : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) - : "memory", "r2", "r3", "r4", "r5", "r6" + : "memory", "r4", "r5", "r6", "r3", "r7" ); } @@ -11740,14 +20969,14 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 
31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -11852,345 +21081,247 @@ static void sp_4096_to_bin_128(sp_digit* r, byte* a) #define sp_4096_norm_128(a) #ifndef WOLFSSL_SP_SMALL -/* Sub b from a into r. (r = a - b) +/* Sub b from a into a. (a -= b) * - * r A single precision integer. - * a A single precision integer. + * a A single precision integer and result. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_4096_sub_in_place_128(sp_digit* a, - const sp_digit* b) +static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, 
r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm 
%[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, 
r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" 
+ "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); - - return c; + return (uint32_t)(size_t)a; } /* Add b to a into r. (r = a + b) @@ -12199,340 +21330,244 @@ SP_NOINLINE static sp_digit sp_4096_sub_in_place_128(sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" 
- "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, 
r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + 
"LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, 
r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* Multiply a and b into r. (r = a * b) @@ -12618,39 +21653,39 @@ SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_4096_add_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r8, #0\n\t" - "add r6, r6, #512\n\t" - "sub r8, r8, #1\n\t" - "\n1:\n\t" - "adds %[c], %[c], r8\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r3, #0x0\n\t" + "ADD r12, %[a], #0x200\n\t" + "\n" + "L_sp_4096_add_128_word_%=:\n\t" + "ADDS r3, r3, #0x-1\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "MOV r4, #0x0\n\t" + "ADC r3, r4, #0x0\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_4096_add_128_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_4096_add_128_word_%=\n\t" +#endif + "MOV %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -12660,39 +21695,37 @@ SP_NOINLINE static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
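/* In the WOLFSSL_SP_SMALL loop above the carry cannot stay in the flags
 * across the compare-and-branch, so it is parked in r3 as 0 or 1.  Adding -1
 * ("ADDS r3, r3, #0x-1") sets the carry flag exactly when the saved value was
 * 1, and "ADC r3, r4, #0x0" with r4 = 0 captures it again after the four
 * ADCS.  A small C illustration of that round trip (illustration only,
 * 32-bit words assumed):
 */
#include <stdint.h>

static uint32_t carry_round_trip(uint32_t saved)      /* saved is 0 or 1 */
{
    uint64_t t = (uint64_t)saved + 0xffffffffu;       /* ADDS rX, rX, #-1 */
    uint32_t flag_c = (uint32_t)(t >> 32);            /* carry flag: 0 or 1 */
    return flag_c;                                    /* equals saved */
}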
*/ -SP_NOINLINE static sp_digit sp_4096_sub_in_place_128(sp_digit* a, - const sp_digit* b) +static sp_digit sp_4096_sub_in_place_128(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r8, %[a]\n\t" - "add r8, r8, #512\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #8\n\t" - "add %[b], %[b], #8\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" -#else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r8" - ); + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; - return c; + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "ADD r11, %[a], #0x200\n\t" + "\n" + "L_sp_4096_sub_in_pkace_128_word_%=:\n\t" + "RSBS r10, r10, #0x0\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC r10, r10, r10\n\t" + "CMP %[a], r11\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BNE L_sp_4096_sub_in_pkace_128_word_%=\n\t" +#else + "BNE.N L_sp_4096_sub_in_pkace_128_word_%=\n\t" +#endif + "MOV %[a], r10\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)a; } #endif /* WOLFSSL_SP_SMALL */ @@ -12703,88 +21736,74 @@ SP_NOINLINE static sp_digit sp_4096_sub_in_place_128(sp_digit* a, * a A single precision integer. * b A single precision integer. 
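/* The in-place subtract loop above keeps the running borrow in r10 as 0 or
 * all-ones.  "RSBS r10, r10, #0x0" (compute 0 - r10) clears the carry flag
 * exactly when a borrow is pending, and "SBC r10, r10, r10" re-derives the
 * 0/-1 mask after the four SBCS; the same mask is what the function finally
 * returns.  Deriving such a mask from a 0/1 borrow bit in C (illustration
 * only, 32-bit words assumed):
 */
#include <stdint.h>

static uint32_t borrow_mask(uint32_t borrow_bit)      /* borrow_bit is 0 or 1 */
{
    /* Same effect as "SBC r10, r10, r10": all-ones when a borrow occurred. */
    return (uint32_t)0 - borrow_bit;                  /* 0x0 or 0xffffffff */
}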
*/ -SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[128 * 2]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r10, %[a]\n\t" - "mov r11, %[b]\n\t" - "mov r6, #2\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, r10\n\t" - "mov r14, r6\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r5, #0\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #252\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov %[b], r9\n\t" - "sub %[b], %[b], %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add %[b], %[b], r11\n\t" - "\n2:\n\t" - /* Multiply Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [%[b]]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply Done */ - "add %[a], %[a], #4\n\t" - "sub %[b], %[b], #4\n\t" - "cmp %[a], r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, r9\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r12\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #3\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #248\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r], r8]\n\t" - "mov %[a], r10\n\t" - "mov %[b], r11\n\t" - : - : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; - XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); + __asm__ __volatile__ ( + "SUB sp, sp, #0x400\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "\n" + "L_sp_4096_mul_128_outer_%=:\n\t" + "SUBS r3, r5, #0x1fc\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_4096_mul_128_inner_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x200\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_4096_mul_128_inner_done_%=\n\t" +#else + "BEQ.N L_sp_4096_mul_128_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_4096_mul_128_inner_%=\n\t" +#else + "BLE.N L_sp_4096_mul_128_inner_%=\n\t" +#endif + "\n" + "L_sp_4096_mul_128_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x3f8\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE 
L_sp_4096_mul_128_outer_%=\n\t" +#else + "BLE.N L_sp_4096_mul_128_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_4096_mul_128_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_4096_mul_128_store_%=\n\t" +#else + "BGT.N L_sp_4096_mul_128_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" + ); } /* Square a and put result in r. (r = a * a) @@ -12792,131 +21811,97 @@ SP_NOINLINE static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) +static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #4\n\t" - "lsl r6, r6, #8\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #252\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" + "SUB sp, sp, #0x400\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_4096_sqr_128_outer_%=:\n\t" + "SUBS r3, r5, #0x1fc\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_4096_sqr_128_inner_%=:\n\t" + "CMP r4, r3\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 4f\n\t" + "BEQ L_sp_4096_sqr_128_op_sqr_%=\n\t" #else - "beq.n 4f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ + "BEQ.N L_sp_4096_sqr_128_op_sqr_%=\n\t" +#endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[a], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "bal L_sp_4096_sqr_128_op_done_%=\n\t" + "\n" + "L_sp_4096_sqr_128_op_sqr_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "\n" + "L_sp_4096_sqr_128_op_done_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bal 5f\n\t" + "BEQ L_sp_4096_sqr_128_inner_done_%=\n\t" #else - "bal.n 5f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #2\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, 
r10\n\t" - "cmp %[a], r6\n\t" + "BEQ.N L_sp_4096_sqr_128_inner_done_%=\n\t" +#endif + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" + "BGT L_sp_4096_sqr_128_inner_done_%=\n\t" #else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "cmp %[a], r2\n\t" + "BGT.N L_sp_4096_sqr_128_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bgt 3f\n\t" + "BLE L_sp_4096_sqr_128_inner_%=\n\t" #else - "bgt.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" + "BLE.N L_sp_4096_sqr_128_inner_%=\n\t" +#endif + "\n" + "L_sp_4096_sqr_128_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x3f8\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" + "BLE L_sp_4096_sqr_128_outer_%=\n\t" #else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #3\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #248\n\t" - "cmp r8, r6\n\t" + "BLE.N L_sp_4096_sqr_128_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_4096_sqr_128_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" + "BGT L_sp_4096_sqr_128_store_%=\n\t" #else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #3\n\t" - "lsl r3, r3, #8\n\t" - "add r3, r3, #252\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, #4\n\t" - "lsl r6, r6, #8\n\t" - "add sp, sp, r6\n\t" + "BGT.N L_sp_4096_sqr_128_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -12941,48 +21926,714 @@ static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho) *rho = (sp_digit)0 - x; } +#ifdef WOLFSSL_SP_SMALL /* Mul a by digit b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision digit. 
*/ -SP_NOINLINE static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a, - sp_digit b) +static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + __asm__ __volatile__ ( - "add r9, %[a], #512\n\t" /* A[0] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r5, r3, r6, %[b]\n\t" - "mov r4, #0\n\t" - "str r5, [%[r]], #4\n\t" - /* A[0] * B - Done */ - "\n1:\n\t" - "mov r5, #0\n\t" - /* A[] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r6, r8, r6, %[b]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[] * B - Done */ - "str r3, [%[r]], #4\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "cmp %[a], r9\n\t" + "LDR r8, [%[a]]\n\t" + "UMULL r5, r3, %[b], r8\n\t" + "MOV r4, #0x0\n\t" + "STR r5, [%[r]]\n\t" + "MOV r5, #0x0\n\t" + "MOV r9, #0x4\n\t" + "\n" + "L_sp_4096_mul_d_128_word_%=:\n\t" + /* A[i] * B */ + "LDR r8, [%[a], r9]\n\t" + "UMULL r6, r7, %[b], r8\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], r9]\n\t" + "MOV r3, r4\n\t" + "MOV r4, r5\n\t" + "MOV r5, #0x0\n\t" + "ADD r9, r9, #0x4\n\t" + "CMP r9, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_4096_mul_d_128_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r]]\n\t" - : [r] "+r" (r), [a] "+r" (a) - : [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9" + "BLT.N L_sp_4096_mul_d_128_word_%=\n\t" +#endif + "STR r3, [%[r], #512]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } +#else +/* Mul a by digit b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision digit. 
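/* The WOLFSSL_SP_SMALL loop above multiplies the 128-word a by the single
 * word b, keeping the running sum in r3:r4:r5 and storing one result word per
 * iteration; the final carry word goes to offset 512 (r[128]), so r needs
 * room for 129 words.  A minimal C sketch, assuming 32-bit words
 * (ref_mul_d_128 is a hypothetical name, not wolfSSL code):
 */
#include <stdint.h>

static void ref_mul_d_128(uint32_t* r, const uint32_t* a, uint32_t b)
{
    uint64_t acc = 0;
    int i;

    for (i = 0; i < 128; i++) {
        acc += (uint64_t)a[i] * b;         /* cannot overflow 64 bits */
        r[i] = (uint32_t)acc;
        acc >>= 32;
    }
    r[128] = (uint32_t)acc;                /* carry into the 129th word */
}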
+ */ +static void sp_4096_mul_d_128(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + + __asm__ __volatile__ ( + /* A[0] * B */ + "LDM %[a]!, {r8}\n\t" + "UMULL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[1] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[2] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[3] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[4] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[5] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[6] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[7] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[8] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[9] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[10] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[11] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[12] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[13] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[14] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[15] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[16] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[17] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[18] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[19] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[20] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[21] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[22] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[23] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[24] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[25] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[26] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[27] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" 
+ "MOV r5, #0x0\n\t" + /* A[28] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[29] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[30] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[31] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[32] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[33] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[34] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[35] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[36] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[37] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[38] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[39] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[40] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[41] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[42] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[43] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[44] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[45] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[46] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[47] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[48] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[49] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[50] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[51] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[52] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[53] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[54] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[55] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[56] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[57] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, 
#0x0\n\t" + /* A[58] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[59] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[60] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[61] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[62] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[63] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[64] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[65] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[66] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[67] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[68] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[69] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[70] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[71] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[72] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[73] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[74] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[75] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[76] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[77] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[78] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[79] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[80] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[81] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[82] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[83] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[84] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[85] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[86] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[87] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* 
A[88] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[89] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[90] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[91] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[92] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[93] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[94] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[95] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[96] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[97] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[98] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[99] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[100] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[101] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[102] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[103] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[104] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[105] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[106] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[107] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[108] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[109] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[110] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[111] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[112] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[113] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[114] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[115] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[116] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[117] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + 
/* A[118] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[119] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[120] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[121] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[122] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[123] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[124] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[125] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[126] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[127] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "STR r5, [%[r]]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 4096 bits, just need to subtract. @@ -12999,6 +22650,7 @@ static void sp_4096_mont_norm_128(sp_digit* r, const sp_digit* m) } #endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */ +#ifdef WOLFSSL_SP_SMALL /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -13007,142 +22659,2273 @@ static void sp_4096_mont_norm_128(sp_digit* r, const sp_digit* m) * b A single precision number to subtract. * m Mask value to apply. 
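/* The fully unrolled variant above relies on UMLAL, which computes
 * {rdhi:rdlo} += rn * rm; because the incoming high word is zeroed before
 * each step, every step reduces to "carry + a[i] * b", which always fits in
 * 64 bits.  One step of that chain in C (illustration only, 32-bit words
 * assumed):
 */
#include <stdint.h>

static uint32_t umlal_step(uint32_t carry_in, uint32_t a_i, uint32_t b,
                           uint32_t* out)
{
    uint64_t acc = (uint64_t)carry_in + (uint64_t)a_i * b;  /* no overflow */
    *out = (uint32_t)acc;          /* word written by "STM %[r]!, {...}" */
    return (uint32_t)(acc >> 32);  /* carry into the next word */
}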
*/ -SP_NOINLINE static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a, - const sp_digit* b, sp_digit m) +static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #2\n\t" - "lsl r5, r5, #8\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r5, [%[a], r8]\n\t" - "sbcs r5, r5, r6\n\t" - "sbcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_4096_cond_sub_128_words_%=:\n\t" + "SUBS r4, r8, r4\n\t" + "LDR r6, [%[a], r5]\n\t" + "LDR r7, [%[b], r5]\n\t" + "AND r7, r7, %[m]\n\t" + "SBCS r6, r6, r7\n\t" + "SBC r4, r8, r8\n\t" + "STR r6, [%[r], r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_4096_cond_sub_128_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_4096_cond_sub_128_words_%=\n\t" +#endif + "MOV %[r], r4\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
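/* sp_4096_cond_sub_128 subtracts b from a only when the mask m is all-ones,
 * but it executes the same instruction sequence either way, so the operation
 * is constant time with respect to m; the return value is 0 or all-ones
 * depending on whether the subtraction borrowed.  A minimal C sketch,
 * assuming 32-bit words (ref_cond_sub_128 is a hypothetical name, not
 * wolfSSL code):
 */
#include <stdint.h>

static uint32_t ref_cond_sub_128(uint32_t* r, const uint32_t* a,
                                 const uint32_t* b, uint32_t m)
{
    uint32_t borrow = 0;
    int i;

    for (i = 0; i < 128; i++) {
        uint64_t d = (uint64_t)a[i] - (b[i] & m) - borrow;  /* m is 0 or -1 */
        r[i] = (uint32_t)d;
        borrow = (uint32_t)((d >> 32) & 1);
    }
    return (uint32_t)0 - borrow;           /* 0 or 0xffffffff */
}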
+ */ +static sp_digit sp_4096_cond_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r5, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SUBS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND 
r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + 
"STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, 
%[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "SBC %[r], r5, r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_NO_UMAAL /* Reduce the number back to 4096 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. 
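For reference while reviewing the generated assembly above, the following is a plain-C sketch (not part of the patch) of what both sp_4096_cond_sub_128 variants compute: b is masked with m (0 or all-ones) before a word-wise subtract with borrow, and the final borrow is returned as 0 or 0xFFFFFFFF. uint32_t stands in for sp_digit (a 32-bit word in this build) and the helper name is made up for illustration.

#include <stdint.h>

/* Illustrative C model of sp_4096_cond_sub_128 (not the patch code). */
static uint32_t cond_sub_128_ref(uint32_t* r, const uint32_t* a,
    const uint32_t* b, uint32_t m)
{
    uint32_t borrow = 0;
    for (int i = 0; i < 128; i++) {
        /* Subtract the masked word and the running borrow in 64 bits. */
        uint64_t t = (uint64_t)a[i] - (b[i] & m) - borrow;
        r[i] = (uint32_t)t;
        borrow = (uint32_t)(t >> 63);  /* 1 when the 32-bit word underflowed */
    }
    return (uint32_t)0 - borrow;       /* 0 or all-ones, like the SBC result */
}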
*/ -SP_NOINLINE static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - sp_digit ca = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( - "mov r9, %[mp]\n\t" - "mov r12, %[m]\n\t" - "mov r10, %[a]\n\t" - "mov r4, #0\n\t" - "add r11, r10, #512\n\t" - "\n1:\n\t" + "LDR lr, [%[m]]\n\t" + /* i = 0 */ + "MOV r11, #0x0\n\t" + "MOV r3, #0x0\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[a], #4]\n\t" + "\n" + "L_sp_4096_mont_reduce_128_word_%=:\n\t" /* mu = a[i] * mp */ - "mov %[mp], r9\n\t" - "ldr %[a], [r10]\n\t" - "mul %[mp], %[mp], %[a]\n\t" - "mov %[m], r12\n\t" - "add r14, r10, #504\n\t" - "\n2:\n\t" - /* a[i+j] += m[j] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+j+1] += m[j+1] * mu */ - "ldr %[a], [r10]\n\t" - "mov r4, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r4, r4, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r5, r5, %[a]\n\t" - "adc r4, r4, #0\n\t" - "str r5, [r10], #4\n\t" - "cmp r10, r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 2b\n\t" -#else - "blt.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ + "MUL r10, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "MOV r7, #0x0\n\t" + "UMLAL r4, r7, r10, lr\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r9, [%[m], #4]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r5, r6, r10, r9\n\t" + "MOV r4, r5\n\t" + "ADDS r4, r4, r7\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r9, [%[m], #8]\n\t" + "LDR r5, [%[a], #8]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r5, r7, r10, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r9, [%[m], #12]\n\t" + "LDR r12, [%[a], #12]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #12]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r9, [%[m], #16]\n\t" + "LDR r12, [%[a], #16]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #16]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r9, [%[m], #20]\n\t" + "LDR r12, [%[a], #20]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #20]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r9, [%[m], #24]\n\t" + "LDR r12, [%[a], #24]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #24]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r9, [%[m], #28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #28]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r9, [%[m], #32]\n\t" + "LDR r12, [%[a], #32]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #32]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r9, [%[m], #36]\n\t" + "LDR 
r12, [%[a], #36]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #36]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r9, [%[m], #40]\n\t" + "LDR r12, [%[a], #40]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #40]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r9, [%[m], #44]\n\t" + "LDR r12, [%[a], #44]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #44]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r9, [%[m], #48]\n\t" + "LDR r12, [%[a], #48]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #48]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r9, [%[m], #52]\n\t" + "LDR r12, [%[a], #52]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #52]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r9, [%[m], #56]\n\t" + "LDR r12, [%[a], #56]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #56]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r9, [%[m], #60]\n\t" + "LDR r12, [%[a], #60]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #60]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r9, [%[m], #64]\n\t" + "LDR r12, [%[a], #64]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #64]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r9, [%[m], #68]\n\t" + "LDR r12, [%[a], #68]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #68]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r9, [%[m], #72]\n\t" + "LDR r12, [%[a], #72]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #72]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r9, [%[m], #76]\n\t" + "LDR r12, [%[a], #76]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #76]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r9, [%[m], #80]\n\t" + "LDR r12, [%[a], #80]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #80]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r9, [%[m], #84]\n\t" + "LDR r12, [%[a], #84]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #84]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r9, [%[m], #88]\n\t" + "LDR r12, [%[a], #88]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #88]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r9, [%[m], #92]\n\t" + "LDR r12, [%[a], #92]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #92]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r9, [%[m], #96]\n\t" + "LDR r12, [%[a], #96]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #96]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r9, [%[m], #100]\n\t" + "LDR r12, [%[a], #100]\n\t" + "MOV r6, #0x0\n\t" + 
"UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #100]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r9, [%[m], #104]\n\t" + "LDR r12, [%[a], #104]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #104]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r9, [%[m], #108]\n\t" + "LDR r12, [%[a], #108]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #108]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r9, [%[m], #112]\n\t" + "LDR r12, [%[a], #112]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #112]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r9, [%[m], #116]\n\t" + "LDR r12, [%[a], #116]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #116]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+30] += m[30] * mu */ + "LDR r9, [%[m], #120]\n\t" + "LDR r12, [%[a], #120]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #120]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+31] += m[31] * mu */ + "LDR r9, [%[m], #124]\n\t" + "LDR r12, [%[a], #124]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #124]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+32] += m[32] * mu */ + "LDR r9, [%[m], #128]\n\t" + "LDR r12, [%[a], #128]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #128]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+33] += m[33] * mu */ + "LDR r9, [%[m], #132]\n\t" + "LDR r12, [%[a], #132]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #132]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+34] += m[34] * mu */ + "LDR r9, [%[m], #136]\n\t" + "LDR r12, [%[a], #136]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #136]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+35] += m[35] * mu */ + "LDR r9, [%[m], #140]\n\t" + "LDR r12, [%[a], #140]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #140]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+36] += m[36] * mu */ + "LDR r9, [%[m], #144]\n\t" + "LDR r12, [%[a], #144]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #144]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+37] += m[37] * mu */ + "LDR r9, [%[m], #148]\n\t" + "LDR r12, [%[a], #148]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #148]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+38] += m[38] * mu */ + "LDR r9, [%[m], #152]\n\t" + "LDR r12, [%[a], #152]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #152]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+39] += m[39] * mu */ + "LDR r9, [%[m], #156]\n\t" + "LDR r12, [%[a], #156]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #156]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+40] += m[40] * mu */ + "LDR r9, [%[m], #160]\n\t" + "LDR r12, [%[a], #160]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #160]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+41] += m[41] * mu */ + "LDR r9, [%[m], #164]\n\t" + "LDR r12, [%[a], #164]\n\t" + "MOV r6, #0x0\n\t" + 
"UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #164]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+42] += m[42] * mu */ + "LDR r9, [%[m], #168]\n\t" + "LDR r12, [%[a], #168]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #168]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+43] += m[43] * mu */ + "LDR r9, [%[m], #172]\n\t" + "LDR r12, [%[a], #172]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #172]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+44] += m[44] * mu */ + "LDR r9, [%[m], #176]\n\t" + "LDR r12, [%[a], #176]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #176]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+45] += m[45] * mu */ + "LDR r9, [%[m], #180]\n\t" + "LDR r12, [%[a], #180]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #180]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+46] += m[46] * mu */ + "LDR r9, [%[m], #184]\n\t" + "LDR r12, [%[a], #184]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #184]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+47] += m[47] * mu */ + "LDR r9, [%[m], #188]\n\t" + "LDR r12, [%[a], #188]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #188]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+48] += m[48] * mu */ + "LDR r9, [%[m], #192]\n\t" + "LDR r12, [%[a], #192]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #192]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+49] += m[49] * mu */ + "LDR r9, [%[m], #196]\n\t" + "LDR r12, [%[a], #196]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #196]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+50] += m[50] * mu */ + "LDR r9, [%[m], #200]\n\t" + "LDR r12, [%[a], #200]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #200]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+51] += m[51] * mu */ + "LDR r9, [%[m], #204]\n\t" + "LDR r12, [%[a], #204]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #204]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+52] += m[52] * mu */ + "LDR r9, [%[m], #208]\n\t" + "LDR r12, [%[a], #208]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #208]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+53] += m[53] * mu */ + "LDR r9, [%[m], #212]\n\t" + "LDR r12, [%[a], #212]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #212]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+54] += m[54] * mu */ + "LDR r9, [%[m], #216]\n\t" + "LDR r12, [%[a], #216]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #216]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+55] += m[55] * mu */ + "LDR r9, [%[m], #220]\n\t" + "LDR r12, [%[a], #220]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #220]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+56] += m[56] * mu */ + "LDR r9, [%[m], #224]\n\t" + "LDR r12, [%[a], #224]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #224]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+57] += m[57] * mu */ + "LDR r9, [%[m], #228]\n\t" + "LDR r12, [%[a], #228]\n\t" + "MOV r6, #0x0\n\t" + 
"UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #228]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+58] += m[58] * mu */ + "LDR r9, [%[m], #232]\n\t" + "LDR r12, [%[a], #232]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #232]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+59] += m[59] * mu */ + "LDR r9, [%[m], #236]\n\t" + "LDR r12, [%[a], #236]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #236]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+60] += m[60] * mu */ + "LDR r9, [%[m], #240]\n\t" + "LDR r12, [%[a], #240]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #240]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+61] += m[61] * mu */ + "LDR r9, [%[m], #244]\n\t" + "LDR r12, [%[a], #244]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #244]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+62] += m[62] * mu */ + "LDR r9, [%[m], #248]\n\t" + "LDR r12, [%[a], #248]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #248]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+63] += m[63] * mu */ + "LDR r9, [%[m], #252]\n\t" + "LDR r12, [%[a], #252]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #252]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+64] += m[64] * mu */ + "LDR r9, [%[m], #256]\n\t" + "LDR r12, [%[a], #256]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #256]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+65] += m[65] * mu */ + "LDR r9, [%[m], #260]\n\t" + "LDR r12, [%[a], #260]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #260]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+66] += m[66] * mu */ + "LDR r9, [%[m], #264]\n\t" + "LDR r12, [%[a], #264]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #264]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+67] += m[67] * mu */ + "LDR r9, [%[m], #268]\n\t" + "LDR r12, [%[a], #268]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #268]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+68] += m[68] * mu */ + "LDR r9, [%[m], #272]\n\t" + "LDR r12, [%[a], #272]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #272]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+69] += m[69] * mu */ + "LDR r9, [%[m], #276]\n\t" + "LDR r12, [%[a], #276]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #276]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+70] += m[70] * mu */ + "LDR r9, [%[m], #280]\n\t" + "LDR r12, [%[a], #280]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #280]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+71] += m[71] * mu */ + "LDR r9, [%[m], #284]\n\t" + "LDR r12, [%[a], #284]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #284]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+72] += m[72] * mu */ + "LDR r9, [%[m], #288]\n\t" + "LDR r12, [%[a], #288]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #288]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+73] += m[73] * mu */ + "LDR r9, [%[m], #292]\n\t" + "LDR r12, [%[a], #292]\n\t" + "MOV r6, #0x0\n\t" + 
"UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #292]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+74] += m[74] * mu */ + "LDR r9, [%[m], #296]\n\t" + "LDR r12, [%[a], #296]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #296]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+75] += m[75] * mu */ + "LDR r9, [%[m], #300]\n\t" + "LDR r12, [%[a], #300]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #300]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+76] += m[76] * mu */ + "LDR r9, [%[m], #304]\n\t" + "LDR r12, [%[a], #304]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #304]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+77] += m[77] * mu */ + "LDR r9, [%[m], #308]\n\t" + "LDR r12, [%[a], #308]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #308]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+78] += m[78] * mu */ + "LDR r9, [%[m], #312]\n\t" + "LDR r12, [%[a], #312]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #312]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+79] += m[79] * mu */ + "LDR r9, [%[m], #316]\n\t" + "LDR r12, [%[a], #316]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #316]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+80] += m[80] * mu */ + "LDR r9, [%[m], #320]\n\t" + "LDR r12, [%[a], #320]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #320]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+81] += m[81] * mu */ + "LDR r9, [%[m], #324]\n\t" + "LDR r12, [%[a], #324]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #324]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+82] += m[82] * mu */ + "LDR r9, [%[m], #328]\n\t" + "LDR r12, [%[a], #328]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #328]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+83] += m[83] * mu */ + "LDR r9, [%[m], #332]\n\t" + "LDR r12, [%[a], #332]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #332]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+84] += m[84] * mu */ + "LDR r9, [%[m], #336]\n\t" + "LDR r12, [%[a], #336]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #336]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+85] += m[85] * mu */ + "LDR r9, [%[m], #340]\n\t" + "LDR r12, [%[a], #340]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #340]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+86] += m[86] * mu */ + "LDR r9, [%[m], #344]\n\t" + "LDR r12, [%[a], #344]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #344]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+87] += m[87] * mu */ + "LDR r9, [%[m], #348]\n\t" + "LDR r12, [%[a], #348]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #348]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+88] += m[88] * mu */ + "LDR r9, [%[m], #352]\n\t" + "LDR r12, [%[a], #352]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #352]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+89] += m[89] * mu */ + "LDR r9, [%[m], #356]\n\t" + "LDR r12, [%[a], #356]\n\t" + "MOV r6, #0x0\n\t" + 
"UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #356]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+90] += m[90] * mu */ + "LDR r9, [%[m], #360]\n\t" + "LDR r12, [%[a], #360]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #360]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+91] += m[91] * mu */ + "LDR r9, [%[m], #364]\n\t" + "LDR r12, [%[a], #364]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #364]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+92] += m[92] * mu */ + "LDR r9, [%[m], #368]\n\t" + "LDR r12, [%[a], #368]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #368]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+93] += m[93] * mu */ + "LDR r9, [%[m], #372]\n\t" + "LDR r12, [%[a], #372]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #372]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+94] += m[94] * mu */ + "LDR r9, [%[m], #376]\n\t" + "LDR r12, [%[a], #376]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #376]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+95] += m[95] * mu */ + "LDR r9, [%[m], #380]\n\t" + "LDR r12, [%[a], #380]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #380]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+96] += m[96] * mu */ + "LDR r9, [%[m], #384]\n\t" + "LDR r12, [%[a], #384]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #384]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+97] += m[97] * mu */ + "LDR r9, [%[m], #388]\n\t" + "LDR r12, [%[a], #388]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #388]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+98] += m[98] * mu */ + "LDR r9, [%[m], #392]\n\t" + "LDR r12, [%[a], #392]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #392]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+99] += m[99] * mu */ + "LDR r9, [%[m], #396]\n\t" + "LDR r12, [%[a], #396]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #396]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+100] += m[100] * mu */ + "LDR r9, [%[m], #400]\n\t" + "LDR r12, [%[a], #400]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #400]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+101] += m[101] * mu */ + "LDR r9, [%[m], #404]\n\t" + "LDR r12, [%[a], #404]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #404]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+102] += m[102] * mu */ + "LDR r9, [%[m], #408]\n\t" + "LDR r12, [%[a], #408]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #408]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+103] += m[103] * mu */ + "LDR r9, [%[m], #412]\n\t" + "LDR r12, [%[a], #412]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #412]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+104] += m[104] * mu */ + "LDR r9, [%[m], #416]\n\t" + "LDR r12, [%[a], #416]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #416]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+105] += m[105] * mu */ + "LDR r9, [%[m], #420]\n\t" + "LDR r12, [%[a], #420]\n\t" + "MOV r6, 
#0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #420]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+106] += m[106] * mu */ + "LDR r9, [%[m], #424]\n\t" + "LDR r12, [%[a], #424]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #424]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+107] += m[107] * mu */ + "LDR r9, [%[m], #428]\n\t" + "LDR r12, [%[a], #428]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #428]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+108] += m[108] * mu */ + "LDR r9, [%[m], #432]\n\t" + "LDR r12, [%[a], #432]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #432]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+109] += m[109] * mu */ + "LDR r9, [%[m], #436]\n\t" + "LDR r12, [%[a], #436]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #436]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+110] += m[110] * mu */ + "LDR r9, [%[m], #440]\n\t" + "LDR r12, [%[a], #440]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #440]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+111] += m[111] * mu */ + "LDR r9, [%[m], #444]\n\t" + "LDR r12, [%[a], #444]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #444]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+112] += m[112] * mu */ + "LDR r9, [%[m], #448]\n\t" + "LDR r12, [%[a], #448]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #448]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+113] += m[113] * mu */ + "LDR r9, [%[m], #452]\n\t" + "LDR r12, [%[a], #452]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #452]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+114] += m[114] * mu */ + "LDR r9, [%[m], #456]\n\t" + "LDR r12, [%[a], #456]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #456]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+115] += m[115] * mu */ + "LDR r9, [%[m], #460]\n\t" + "LDR r12, [%[a], #460]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #460]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+116] += m[116] * mu */ + "LDR r9, [%[m], #464]\n\t" + "LDR r12, [%[a], #464]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #464]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+117] += m[117] * mu */ + "LDR r9, [%[m], #468]\n\t" + "LDR r12, [%[a], #468]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #468]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+118] += m[118] * mu */ + "LDR r9, [%[m], #472]\n\t" + "LDR r12, [%[a], #472]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #472]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+119] += m[119] * mu */ + "LDR r9, [%[m], #476]\n\t" + "LDR r12, [%[a], #476]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #476]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+120] += m[120] * mu */ + "LDR r9, [%[m], #480]\n\t" + "LDR r12, [%[a], #480]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #480]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+121] += m[121] * mu */ + "LDR r9, [%[m], #484]\n\t" + "LDR r12, 
[%[a], #484]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #484]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+122] += m[122] * mu */ + "LDR r9, [%[m], #488]\n\t" + "LDR r12, [%[a], #488]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #488]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+123] += m[123] * mu */ + "LDR r9, [%[m], #492]\n\t" + "LDR r12, [%[a], #492]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #492]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+124] += m[124] * mu */ + "LDR r9, [%[m], #496]\n\t" + "LDR r12, [%[a], #496]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #496]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+125] += m[125] * mu */ + "LDR r9, [%[m], #500]\n\t" + "LDR r12, [%[a], #500]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #500]\n\t" + "ADC r6, r6, #0x0\n\t" /* a[i+126] += m[126] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" + "LDR r9, [%[m], #504]\n\t" + "LDR r12, [%[a], #504]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #504]\n\t" + "ADC r7, r7, #0x0\n\t" /* a[i+127] += m[127] * mu */ - "mov r4, %[ca]\n\t" - "mov %[ca], #0\n\t" - /* Multiply m[127] and mu - Start */ - "ldr r8, [%[m]]\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc %[ca], %[ca], #0\n\t" - /* Multiply m[127] and mu - Done */ - "ldr r6, [r10]\n\t" - "ldr r8, [r10, #4]\n\t" - "adds r6, r6, r5\n\t" - "adcs r8, r8, r4\n\t" - "adc %[ca], %[ca], #0\n\t" - "str r6, [r10]\n\t" - "str r8, [r10, #4]\n\t" - /* Next word in a */ - "sub r10, r10, #504\n\t" - "cmp r10, r11\n\t" + "LDR r9, [%[m], #508]\n\t" + "LDR r12, [%[a], #508]\n\t" + "UMULL r8, r9, r10, r9\n\t" + "ADDS r7, r7, r8\n\t" + "ADCS r6, r9, r3\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, r3\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #508]\n\t" + "LDR r12, [%[a], #512]\n\t" + "ADCS r12, r12, r6\n\t" + "STR r12, [%[a], #512]\n\t" + "ADC r3, r3, #0x0\n\t" + /* i += 1 */ + "ADD r11, r11, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r11, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_4096_mont_reduce_128_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "mov %[m], r12\n\t" - : [ca] "+r" (ca), [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" + "BLT.N L_sp_4096_mont_reduce_128_word_%=\n\t" +#endif + /* Loop Done */ + "STR r4, [%[a]]\n\t" + "STR r5, [%[a], #4]\n\t" + "MOV %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - - sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - ca); + sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp); } +#else +/* Reduce the number back to 4096 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. 
+ * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_4096_mont_reduce_128(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[a], #12]\n\t" + "LDR r10, [%[a], #16]\n\t" + "\n" + "L_sp_4096_mont_reduce_128_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL lr, %[mp], r6\n\t" + /* a[i+0] += m[0] * mu */ + "LDR r12, [%[m]]\n\t" + "MOV r3, #0x0\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r12, [%[m], #4]\n\t" + "MOV r6, r7\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r12, [%[m], #8]\n\t" + "MOV r7, r8\n\t" + "UMAAL r7, r3, lr, r12\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r12, [%[m], #12]\n\t" + "MOV r8, r9\n\t" + "UMAAL r8, r3, lr, r12\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r12, [%[m], #16]\n\t" + "MOV r9, r10\n\t" + "UMAAL r9, r3, lr, r12\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r12, [%[m], #20]\n\t" + "LDR r10, [%[a], #20]\n\t" + "UMAAL r10, r3, lr, r12\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r12, [%[m], #24]\n\t" + "LDR r11, [%[a], #24]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r12, [%[m], #28]\n\t" + "LDR r11, [%[a], #28]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r12, [%[m], #32]\n\t" + "LDR r11, [%[a], #32]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r12, [%[m], #36]\n\t" + "LDR r11, [%[a], #36]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r12, [%[m], #40]\n\t" + "LDR r11, [%[a], #40]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r12, [%[m], #44]\n\t" + "LDR r11, [%[a], #44]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r12, [%[m], #48]\n\t" + "LDR r11, [%[a], #48]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r12, [%[m], #52]\n\t" + "LDR r11, [%[a], #52]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r12, [%[m], #56]\n\t" + "LDR r11, [%[a], #56]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r12, [%[m], #60]\n\t" + "LDR r11, [%[a], #60]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r12, [%[m], #64]\n\t" + "LDR r11, [%[a], #64]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r12, [%[m], #68]\n\t" + "LDR r11, [%[a], #68]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r12, [%[m], #72]\n\t" + "LDR r11, [%[a], #72]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r12, [%[m], #76]\n\t" + "LDR r11, [%[a], #76]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r12, [%[m], #80]\n\t" + "LDR r11, [%[a], #80]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r12, [%[m], 
#84]\n\t" + "LDR r11, [%[a], #84]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #84]\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r12, [%[m], #88]\n\t" + "LDR r11, [%[a], #88]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r12, [%[m], #92]\n\t" + "LDR r11, [%[a], #92]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r12, [%[m], #96]\n\t" + "LDR r11, [%[a], #96]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r12, [%[m], #100]\n\t" + "LDR r11, [%[a], #100]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r12, [%[m], #104]\n\t" + "LDR r11, [%[a], #104]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r12, [%[m], #108]\n\t" + "LDR r11, [%[a], #108]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r12, [%[m], #112]\n\t" + "LDR r11, [%[a], #112]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r12, [%[m], #116]\n\t" + "LDR r11, [%[a], #116]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "LDR r12, [%[m], #120]\n\t" + "LDR r11, [%[a], #120]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "LDR r12, [%[m], #124]\n\t" + "LDR r11, [%[a], #124]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #124]\n\t" + /* a[i+32] += m[32] * mu */ + "LDR r12, [%[m], #128]\n\t" + "LDR r11, [%[a], #128]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #128]\n\t" + /* a[i+33] += m[33] * mu */ + "LDR r12, [%[m], #132]\n\t" + "LDR r11, [%[a], #132]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #132]\n\t" + /* a[i+34] += m[34] * mu */ + "LDR r12, [%[m], #136]\n\t" + "LDR r11, [%[a], #136]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #136]\n\t" + /* a[i+35] += m[35] * mu */ + "LDR r12, [%[m], #140]\n\t" + "LDR r11, [%[a], #140]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #140]\n\t" + /* a[i+36] += m[36] * mu */ + "LDR r12, [%[m], #144]\n\t" + "LDR r11, [%[a], #144]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #144]\n\t" + /* a[i+37] += m[37] * mu */ + "LDR r12, [%[m], #148]\n\t" + "LDR r11, [%[a], #148]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #148]\n\t" + /* a[i+38] += m[38] * mu */ + "LDR r12, [%[m], #152]\n\t" + "LDR r11, [%[a], #152]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #152]\n\t" + /* a[i+39] += m[39] * mu */ + "LDR r12, [%[m], #156]\n\t" + "LDR r11, [%[a], #156]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #156]\n\t" + /* a[i+40] += m[40] * mu */ + "LDR r12, [%[m], #160]\n\t" + "LDR r11, [%[a], #160]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #160]\n\t" + /* a[i+41] += m[41] * mu */ + "LDR r12, [%[m], #164]\n\t" + "LDR r11, [%[a], #164]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #164]\n\t" + /* a[i+42] += m[42] * mu */ + "LDR r12, [%[m], #168]\n\t" + "LDR r11, [%[a], #168]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #168]\n\t" + /* a[i+43] += m[43] * mu */ + "LDR r12, [%[m], #172]\n\t" + "LDR r11, [%[a], #172]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #172]\n\t" + /* a[i+44] += m[44] * mu */ + "LDR r12, [%[m], #176]\n\t" + "LDR r11, [%[a], #176]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], 
#176]\n\t" + /* a[i+45] += m[45] * mu */ + "LDR r12, [%[m], #180]\n\t" + "LDR r11, [%[a], #180]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #180]\n\t" + /* a[i+46] += m[46] * mu */ + "LDR r12, [%[m], #184]\n\t" + "LDR r11, [%[a], #184]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #184]\n\t" + /* a[i+47] += m[47] * mu */ + "LDR r12, [%[m], #188]\n\t" + "LDR r11, [%[a], #188]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #188]\n\t" + /* a[i+48] += m[48] * mu */ + "LDR r12, [%[m], #192]\n\t" + "LDR r11, [%[a], #192]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #192]\n\t" + /* a[i+49] += m[49] * mu */ + "LDR r12, [%[m], #196]\n\t" + "LDR r11, [%[a], #196]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #196]\n\t" + /* a[i+50] += m[50] * mu */ + "LDR r12, [%[m], #200]\n\t" + "LDR r11, [%[a], #200]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #200]\n\t" + /* a[i+51] += m[51] * mu */ + "LDR r12, [%[m], #204]\n\t" + "LDR r11, [%[a], #204]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #204]\n\t" + /* a[i+52] += m[52] * mu */ + "LDR r12, [%[m], #208]\n\t" + "LDR r11, [%[a], #208]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #208]\n\t" + /* a[i+53] += m[53] * mu */ + "LDR r12, [%[m], #212]\n\t" + "LDR r11, [%[a], #212]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #212]\n\t" + /* a[i+54] += m[54] * mu */ + "LDR r12, [%[m], #216]\n\t" + "LDR r11, [%[a], #216]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #216]\n\t" + /* a[i+55] += m[55] * mu */ + "LDR r12, [%[m], #220]\n\t" + "LDR r11, [%[a], #220]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #220]\n\t" + /* a[i+56] += m[56] * mu */ + "LDR r12, [%[m], #224]\n\t" + "LDR r11, [%[a], #224]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #224]\n\t" + /* a[i+57] += m[57] * mu */ + "LDR r12, [%[m], #228]\n\t" + "LDR r11, [%[a], #228]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #228]\n\t" + /* a[i+58] += m[58] * mu */ + "LDR r12, [%[m], #232]\n\t" + "LDR r11, [%[a], #232]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #232]\n\t" + /* a[i+59] += m[59] * mu */ + "LDR r12, [%[m], #236]\n\t" + "LDR r11, [%[a], #236]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #236]\n\t" + /* a[i+60] += m[60] * mu */ + "LDR r12, [%[m], #240]\n\t" + "LDR r11, [%[a], #240]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #240]\n\t" + /* a[i+61] += m[61] * mu */ + "LDR r12, [%[m], #244]\n\t" + "LDR r11, [%[a], #244]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #244]\n\t" + /* a[i+62] += m[62] * mu */ + "LDR r12, [%[m], #248]\n\t" + "LDR r11, [%[a], #248]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #248]\n\t" + /* a[i+63] += m[63] * mu */ + "LDR r12, [%[m], #252]\n\t" + "LDR r11, [%[a], #252]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #252]\n\t" + /* a[i+64] += m[64] * mu */ + "LDR r12, [%[m], #256]\n\t" + "LDR r11, [%[a], #256]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #256]\n\t" + /* a[i+65] += m[65] * mu */ + "LDR r12, [%[m], #260]\n\t" + "LDR r11, [%[a], #260]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #260]\n\t" + /* a[i+66] += m[66] * mu */ + "LDR r12, [%[m], #264]\n\t" + "LDR r11, [%[a], #264]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #264]\n\t" + /* a[i+67] += m[67] * mu */ + "LDR r12, [%[m], #268]\n\t" + "LDR r11, [%[a], #268]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #268]\n\t" + /* a[i+68] += m[68] * mu */ + "LDR r12, [%[m], #272]\n\t" + "LDR 
r11, [%[a], #272]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #272]\n\t" + /* a[i+69] += m[69] * mu */ + "LDR r12, [%[m], #276]\n\t" + "LDR r11, [%[a], #276]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #276]\n\t" + /* a[i+70] += m[70] * mu */ + "LDR r12, [%[m], #280]\n\t" + "LDR r11, [%[a], #280]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #280]\n\t" + /* a[i+71] += m[71] * mu */ + "LDR r12, [%[m], #284]\n\t" + "LDR r11, [%[a], #284]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #284]\n\t" + /* a[i+72] += m[72] * mu */ + "LDR r12, [%[m], #288]\n\t" + "LDR r11, [%[a], #288]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #288]\n\t" + /* a[i+73] += m[73] * mu */ + "LDR r12, [%[m], #292]\n\t" + "LDR r11, [%[a], #292]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #292]\n\t" + /* a[i+74] += m[74] * mu */ + "LDR r12, [%[m], #296]\n\t" + "LDR r11, [%[a], #296]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #296]\n\t" + /* a[i+75] += m[75] * mu */ + "LDR r12, [%[m], #300]\n\t" + "LDR r11, [%[a], #300]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #300]\n\t" + /* a[i+76] += m[76] * mu */ + "LDR r12, [%[m], #304]\n\t" + "LDR r11, [%[a], #304]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #304]\n\t" + /* a[i+77] += m[77] * mu */ + "LDR r12, [%[m], #308]\n\t" + "LDR r11, [%[a], #308]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #308]\n\t" + /* a[i+78] += m[78] * mu */ + "LDR r12, [%[m], #312]\n\t" + "LDR r11, [%[a], #312]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #312]\n\t" + /* a[i+79] += m[79] * mu */ + "LDR r12, [%[m], #316]\n\t" + "LDR r11, [%[a], #316]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #316]\n\t" + /* a[i+80] += m[80] * mu */ + "LDR r12, [%[m], #320]\n\t" + "LDR r11, [%[a], #320]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #320]\n\t" + /* a[i+81] += m[81] * mu */ + "LDR r12, [%[m], #324]\n\t" + "LDR r11, [%[a], #324]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #324]\n\t" + /* a[i+82] += m[82] * mu */ + "LDR r12, [%[m], #328]\n\t" + "LDR r11, [%[a], #328]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #328]\n\t" + /* a[i+83] += m[83] * mu */ + "LDR r12, [%[m], #332]\n\t" + "LDR r11, [%[a], #332]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #332]\n\t" + /* a[i+84] += m[84] * mu */ + "LDR r12, [%[m], #336]\n\t" + "LDR r11, [%[a], #336]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #336]\n\t" + /* a[i+85] += m[85] * mu */ + "LDR r12, [%[m], #340]\n\t" + "LDR r11, [%[a], #340]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #340]\n\t" + /* a[i+86] += m[86] * mu */ + "LDR r12, [%[m], #344]\n\t" + "LDR r11, [%[a], #344]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #344]\n\t" + /* a[i+87] += m[87] * mu */ + "LDR r12, [%[m], #348]\n\t" + "LDR r11, [%[a], #348]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #348]\n\t" + /* a[i+88] += m[88] * mu */ + "LDR r12, [%[m], #352]\n\t" + "LDR r11, [%[a], #352]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #352]\n\t" + /* a[i+89] += m[89] * mu */ + "LDR r12, [%[m], #356]\n\t" + "LDR r11, [%[a], #356]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #356]\n\t" + /* a[i+90] += m[90] * mu */ + "LDR r12, [%[m], #360]\n\t" + "LDR r11, [%[a], #360]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #360]\n\t" + /* a[i+91] += m[91] * mu */ + "LDR r12, [%[m], #364]\n\t" + "LDR r11, [%[a], #364]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], 
#364]\n\t" + /* a[i+92] += m[92] * mu */ + "LDR r12, [%[m], #368]\n\t" + "LDR r11, [%[a], #368]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #368]\n\t" + /* a[i+93] += m[93] * mu */ + "LDR r12, [%[m], #372]\n\t" + "LDR r11, [%[a], #372]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #372]\n\t" + /* a[i+94] += m[94] * mu */ + "LDR r12, [%[m], #376]\n\t" + "LDR r11, [%[a], #376]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #376]\n\t" + /* a[i+95] += m[95] * mu */ + "LDR r12, [%[m], #380]\n\t" + "LDR r11, [%[a], #380]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #380]\n\t" + /* a[i+96] += m[96] * mu */ + "LDR r12, [%[m], #384]\n\t" + "LDR r11, [%[a], #384]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #384]\n\t" + /* a[i+97] += m[97] * mu */ + "LDR r12, [%[m], #388]\n\t" + "LDR r11, [%[a], #388]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #388]\n\t" + /* a[i+98] += m[98] * mu */ + "LDR r12, [%[m], #392]\n\t" + "LDR r11, [%[a], #392]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #392]\n\t" + /* a[i+99] += m[99] * mu */ + "LDR r12, [%[m], #396]\n\t" + "LDR r11, [%[a], #396]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #396]\n\t" + /* a[i+100] += m[100] * mu */ + "LDR r12, [%[m], #400]\n\t" + "LDR r11, [%[a], #400]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #400]\n\t" + /* a[i+101] += m[101] * mu */ + "LDR r12, [%[m], #404]\n\t" + "LDR r11, [%[a], #404]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #404]\n\t" + /* a[i+102] += m[102] * mu */ + "LDR r12, [%[m], #408]\n\t" + "LDR r11, [%[a], #408]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #408]\n\t" + /* a[i+103] += m[103] * mu */ + "LDR r12, [%[m], #412]\n\t" + "LDR r11, [%[a], #412]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #412]\n\t" + /* a[i+104] += m[104] * mu */ + "LDR r12, [%[m], #416]\n\t" + "LDR r11, [%[a], #416]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #416]\n\t" + /* a[i+105] += m[105] * mu */ + "LDR r12, [%[m], #420]\n\t" + "LDR r11, [%[a], #420]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #420]\n\t" + /* a[i+106] += m[106] * mu */ + "LDR r12, [%[m], #424]\n\t" + "LDR r11, [%[a], #424]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #424]\n\t" + /* a[i+107] += m[107] * mu */ + "LDR r12, [%[m], #428]\n\t" + "LDR r11, [%[a], #428]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #428]\n\t" + /* a[i+108] += m[108] * mu */ + "LDR r12, [%[m], #432]\n\t" + "LDR r11, [%[a], #432]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #432]\n\t" + /* a[i+109] += m[109] * mu */ + "LDR r12, [%[m], #436]\n\t" + "LDR r11, [%[a], #436]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #436]\n\t" + /* a[i+110] += m[110] * mu */ + "LDR r12, [%[m], #440]\n\t" + "LDR r11, [%[a], #440]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #440]\n\t" + /* a[i+111] += m[111] * mu */ + "LDR r12, [%[m], #444]\n\t" + "LDR r11, [%[a], #444]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #444]\n\t" + /* a[i+112] += m[112] * mu */ + "LDR r12, [%[m], #448]\n\t" + "LDR r11, [%[a], #448]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #448]\n\t" + /* a[i+113] += m[113] * mu */ + "LDR r12, [%[m], #452]\n\t" + "LDR r11, [%[a], #452]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #452]\n\t" + /* a[i+114] += m[114] * mu */ + "LDR r12, [%[m], #456]\n\t" + "LDR r11, [%[a], #456]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #456]\n\t" + /* a[i+115] += m[115] * mu */ + "LDR 
r12, [%[m], #460]\n\t" + "LDR r11, [%[a], #460]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #460]\n\t" + /* a[i+116] += m[116] * mu */ + "LDR r12, [%[m], #464]\n\t" + "LDR r11, [%[a], #464]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #464]\n\t" + /* a[i+117] += m[117] * mu */ + "LDR r12, [%[m], #468]\n\t" + "LDR r11, [%[a], #468]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #468]\n\t" + /* a[i+118] += m[118] * mu */ + "LDR r12, [%[m], #472]\n\t" + "LDR r11, [%[a], #472]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #472]\n\t" + /* a[i+119] += m[119] * mu */ + "LDR r12, [%[m], #476]\n\t" + "LDR r11, [%[a], #476]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #476]\n\t" + /* a[i+120] += m[120] * mu */ + "LDR r12, [%[m], #480]\n\t" + "LDR r11, [%[a], #480]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #480]\n\t" + /* a[i+121] += m[121] * mu */ + "LDR r12, [%[m], #484]\n\t" + "LDR r11, [%[a], #484]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #484]\n\t" + /* a[i+122] += m[122] * mu */ + "LDR r12, [%[m], #488]\n\t" + "LDR r11, [%[a], #488]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #488]\n\t" + /* a[i+123] += m[123] * mu */ + "LDR r12, [%[m], #492]\n\t" + "LDR r11, [%[a], #492]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #492]\n\t" + /* a[i+124] += m[124] * mu */ + "LDR r12, [%[m], #496]\n\t" + "LDR r11, [%[a], #496]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #496]\n\t" + /* a[i+125] += m[125] * mu */ + "LDR r12, [%[m], #500]\n\t" + "LDR r11, [%[a], #500]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #500]\n\t" + /* a[i+126] += m[126] * mu */ + "LDR r12, [%[m], #504]\n\t" + "LDR r11, [%[a], #504]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #504]\n\t" + /* a[i+127] += m[127] * mu */ + "LDR r12, [%[m], #508]\n\t" + "LDR r11, [%[a], #508]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "LDR lr, [%[a], #512]\n\t" + "MOV r12, #0x0\n\t" + "UMAAL r3, lr, r12, r12\n\t" + "STR r11, [%[a], #508]\n\t" + "ADDS r3, r3, r5\n\t" + "ADC r5, lr, #0x0\n\t" + "STR r3, [%[a], #512]\n\t" + /* i += 1 */ + "ADD r4, r4, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r4, #0x200\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_4096_mont_reduce_128_word_%=\n\t" +#else + "BLT.N L_sp_4096_mont_reduce_128_word_%=\n\t" +#endif + /* Loop Done */ + "STR r6, [%[a]]\n\t" + "STR r7, [%[a], #4]\n\t" + "STR r8, [%[a], #8]\n\t" + "STR r9, [%[a], #12]\n\t" + "STR r10, [%[a], #16]\n\t" + "MOV %[mp], r5\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + sp_4096_cond_sub_128(a - 128, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -13180,39 +24963,38 @@ SP_NOINLINE static void sp_4096_mont_sqr_128(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
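 *
 * Illustrative sketch (editor's addition, not part of the patch): both the looped
 * and the fully unrolled SBCS sequences compute an ordinary multi-precision
 * subtract with borrow propagation.  A portable C equivalent, assuming sp_digit
 * is a 32-bit word and 128 words per operand, keeps the borrow in the top half
 * of a signed 64-bit accumulator (0 while no borrow has occurred, -1 after one),
 * matching the 0/all-ones mask produced by the final SBC:
 *
 *   int64_t d = 0;
 *   for (int i = 0; i < 128; i++) {
 *       d += (int64_t)a[i] - (int64_t)b[i];
 *       r[i] = (sp_digit)d;
 *       d >>= 32;
 *   }
 *   return (sp_digit)d;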
*/ -SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r5, #2\n\t" - "lsl r5, r5, #8\n\t" - "add r6, r6, r5\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "sbcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r11, #0x0\n\t" + "ADD r12, %[a], #0x200\n\t" + "\n" + "L_sp_4096_sub_128_word_%=:\n\t" + "RSBS r11, r11, #0x0\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC r11, r3, r3\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_4096_sub_128_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_4096_sub_128_word_%=\n\t" +#endif + "MOV %[r], r11\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #else @@ -13222,342 +25004,247 @@ SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_4096_sub_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" 
- "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, 
r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, 
r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, 
r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_USE_UDIV /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -13567,49 +25254,122 @@ SP_NOINLINE static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, * * Note that this is an approximate div. It may give an answer 1 larger. 
*/ -SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, - sp_digit div) +static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - sp_digit r = 0; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( - "lsr r6, %[div], #16\n\t" - "add r6, r6, #1\n\t" - "udiv r4, %[d1], r6\n\t" - "lsl r8, r4, #16\n\t" - "umull r4, r5, %[div], r8\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r5, %[d1], r6\n\t" - "lsl r4, r5, #16\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r4, %[d0], %[div]\n\t" - "add r8, r8, r4\n\t" - "mov %[r], r8\n\t" - : [r] "+r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "r4", "r5", "r6", "r8" + "LSR r8, %[div], #16\n\t" + "ADD r5, r8, #0x1\n\t" + "UDIV r6, %[d1], r5\n\t" + "LSL r7, %[div], #16\n\t" + "LSL r6, r6, #16\n\t" + "UMULL r3, r4, %[div], r6\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "SUBS r3, %[d1], r5\n\t" + "SBC r9, r9, r9\n\t" + "ADD r9, r9, #0x1\n\t" + "RSB r10, r9, #0x0\n\t" + "LSL r9, r9, #16\n\t" + "AND r7, r7, r10\n\t" + "AND r8, r8, r10\n\t" + "SUBS %[d0], %[d0], r7\n\t" + "ADD r6, r6, r9\n\t" + "SBC %[d1], %[d1], r8\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "UMULL r3, r4, %[div], r3\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "MUL r3, %[div], r3\n\t" + "SUB %[d0], %[d0], r3\n\t" + "UDIV r3, %[d0], %[div]\n\t" + "ADD %[d1], r6, r3\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - return r; + return (uint32_t)(size_t)d1; } +#else +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. + * div The divisor. + * returns the result of the division. + * + * Note that this is an approximate div. It may give an answer 1 larger. 
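+ *
+ * Illustrative sketch (editor's addition, not part of the generated code): since
+ * the returned quotient digit can be at most one too large, a caller only needs
+ * a single correction step.  Hypothetical usage, assuming sp_digit is a 32-bit
+ * word and that d1 < div so the quotient fits in one word:
+ *
+ *   sp_digit q  = div_4096_word_128(d1, d0, div);
+ *   uint64_t n  = ((uint64_t)d1 << 32) | d0;
+ *   uint64_t qd = (uint64_t)q * div;
+ *   if (qd > n)
+ *       q--;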
+ */ +static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +{ + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; + + __asm__ __volatile__ ( + "LSR r5, %[div], #1\n\t" + "ADD r5, r5, #0x1\n\t" + "MOV r6, %[d0]\n\t" + "MOV r7, %[d1]\n\t" + /* Do top 32 */ + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "MOV r3, #0x0\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + /* Next 30 bits */ + "MOV r4, #0x1d\n\t" + "\n" + "L_div_4096_word_128_bit_%=:\n\t" + "LSLS r6, r6, #1\n\t" + "ADC r7, r7, r7\n\t" + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "ADD r3, r3, r3\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + "SUBS r4, r4, #0x1\n\t" + "bpl L_div_4096_word_128_bit_%=\n\t" + "ADD r3, r3, r3\n\t" + "ADD r3, r3, #0x1\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "SUBS r8, %[div], r9\n\t" + "SBC r8, r8, r8\n\t" + "SUB %[d1], r3, r8\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)d1; +} + +#endif /* Divide d in a and put remainder into r (m*d + r = a) * m is not calculated as it is not needed at this time. * @@ -13682,6 +25442,7 @@ static WC_INLINE int sp_4096_mod_128_cond(sp_digit* r, const sp_digit* a, const } #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH) +#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY) /* AND m into each word of a and store in r. * * r A single precision integer. @@ -13719,46 +25480,1451 @@ static void sp_4096_mask_128(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
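 *
 * Illustrative sketch (editor's addition, not part of the patch): the conditional
 * moves below keep the comparison constant time by masking out every word after
 * the first difference is found.  The same idea in portable C, assuming sp_digit
 * is a 32-bit word and 128 words per operand, most significant word first (the
 * C comparisons are only for illustration; the assembly stays branch-free):
 *
 *   sp_int32 res = 0;
 *   sp_digit mask = (sp_digit)-1;
 *   for (int i = 127; i >= 0; i--) {
 *       sp_digit x = a[i] & mask;
 *       sp_digit y = b[i] & mask;
 *       res += (x > y) - (x < y);
 *       mask &= (sp_digit)0 - (sp_digit)(x == y);
 *   }
 *   return res;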
*/ -SP_NOINLINE static sp_int32 sp_4096_cmp_128(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_4096_cmp_128(const sp_digit* a_p, const sp_digit* b_p) { - sp_digit r = 0; - + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mvn r3, r3\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add r6, r6, #252\n\t" - "\n1:\n\t" - "ldr r8, [%[a], r6]\n\t" - "ldr r5, [%[b], r6]\n\t" - "and r8, r8, r3\n\t" - "and r5, r5, r3\n\t" - "mov r4, r8\n\t" - "subs r8, r8, r5\n\t" - "sbc r8, r8, r8\n\t" - "add %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "subs r5, r5, r4\n\t" - "sbc r8, r8, r8\n\t" - "sub %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "sub r6, r6, #4\n\t" - "cmp r6, #0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 1b\n\t" + "MOV r2, #0x-1\n\t" + "MOV r8, #0x1\n\t" + "MOV r7, #0x0\n\t" + "MOV r3, #0x-1\n\t" +#ifdef WOLFSSL_SP_SMALL + "MOV r6, #0x1fc\n\t" + "\n" + "L_sp_4096_cmp_128_words_%=:\n\t" + "LDR r4, [%[a], r6]\n\t" + "LDR r5, [%[b], r6]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "SUBS r6, r6, #0x4\n\t" + "bcs L_sp_4096_cmp_128_words_%=\n\t" + "EOR r2, r2, r3\n\t" #else - "bge.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "r3", "r4", "r5", "r6", "r8" + "LDR r4, [%[a], #508]\n\t" + "LDR r5, [%[b], #508]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #504]\n\t" + "LDR r5, [%[b], #504]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #500]\n\t" + "LDR r5, [%[b], #500]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #496]\n\t" + "LDR r5, [%[b], #496]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #492]\n\t" + "LDR r5, [%[b], #492]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #488]\n\t" + "LDR r5, [%[b], #488]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #484]\n\t" + "LDR r5, [%[b], #484]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #480]\n\t" + "LDR r5, [%[b], #480]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #476]\n\t" + "LDR r5, [%[b], #476]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, 
r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #472]\n\t" + "LDR r5, [%[b], #472]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #468]\n\t" + "LDR r5, [%[b], #468]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #464]\n\t" + "LDR r5, [%[b], #464]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #460]\n\t" + "LDR r5, [%[b], #460]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #456]\n\t" + "LDR r5, [%[b], #456]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #452]\n\t" + "LDR r5, [%[b], #452]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #448]\n\t" + "LDR r5, [%[b], #448]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #444]\n\t" + "LDR r5, [%[b], #444]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #440]\n\t" + "LDR r5, [%[b], #440]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #436]\n\t" + "LDR r5, [%[b], #436]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #432]\n\t" + "LDR r5, [%[b], #432]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #428]\n\t" + "LDR r5, [%[b], #428]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #424]\n\t" + "LDR r5, [%[b], #424]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #420]\n\t" + "LDR r5, [%[b], #420]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #416]\n\t" + "LDR r5, [%[b], #416]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, 
r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #412]\n\t" + "LDR r5, [%[b], #412]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #408]\n\t" + "LDR r5, [%[b], #408]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #404]\n\t" + "LDR r5, [%[b], #404]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #400]\n\t" + "LDR r5, [%[b], #400]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #396]\n\t" + "LDR r5, [%[b], #396]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #392]\n\t" + "LDR r5, [%[b], #392]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #388]\n\t" + "LDR r5, [%[b], #388]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #384]\n\t" + "LDR r5, [%[b], #384]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #380]\n\t" + "LDR r5, [%[b], #380]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #376]\n\t" + "LDR r5, [%[b], #376]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #372]\n\t" + "LDR r5, [%[b], #372]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #368]\n\t" + "LDR r5, [%[b], #368]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #364]\n\t" + "LDR r5, [%[b], #364]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #360]\n\t" + "LDR r5, [%[b], #360]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #356]\n\t" + "LDR r5, [%[b], #356]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + 
"movne r3, r7\n\t" + "LDR r4, [%[a], #352]\n\t" + "LDR r5, [%[b], #352]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #348]\n\t" + "LDR r5, [%[b], #348]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #344]\n\t" + "LDR r5, [%[b], #344]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #340]\n\t" + "LDR r5, [%[b], #340]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #336]\n\t" + "LDR r5, [%[b], #336]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #332]\n\t" + "LDR r5, [%[b], #332]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #328]\n\t" + "LDR r5, [%[b], #328]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #324]\n\t" + "LDR r5, [%[b], #324]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #320]\n\t" + "LDR r5, [%[b], #320]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #316]\n\t" + "LDR r5, [%[b], #316]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #312]\n\t" + "LDR r5, [%[b], #312]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #308]\n\t" + "LDR r5, [%[b], #308]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #304]\n\t" + "LDR r5, [%[b], #304]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #300]\n\t" + "LDR r5, [%[b], #300]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #296]\n\t" + "LDR r5, [%[b], #296]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #292]\n\t" + "LDR r5, 
[%[b], #292]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #288]\n\t" + "LDR r5, [%[b], #288]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #284]\n\t" + "LDR r5, [%[b], #284]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #280]\n\t" + "LDR r5, [%[b], #280]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #276]\n\t" + "LDR r5, [%[b], #276]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #272]\n\t" + "LDR r5, [%[b], #272]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #268]\n\t" + "LDR r5, [%[b], #268]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #264]\n\t" + "LDR r5, [%[b], #264]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #260]\n\t" + "LDR r5, [%[b], #260]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #256]\n\t" + "LDR r5, [%[b], #256]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #252]\n\t" + "LDR r5, [%[b], #252]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #248]\n\t" + "LDR r5, [%[b], #248]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #244]\n\t" + "LDR r5, [%[b], #244]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #240]\n\t" + "LDR r5, [%[b], #240]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #236]\n\t" + "LDR r5, [%[b], #236]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #232]\n\t" + "LDR r5, [%[b], #232]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, 
r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #228]\n\t" + "LDR r5, [%[b], #228]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #224]\n\t" + "LDR r5, [%[b], #224]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #220]\n\t" + "LDR r5, [%[b], #220]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #216]\n\t" + "LDR r5, [%[b], #216]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #212]\n\t" + "LDR r5, [%[b], #212]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #208]\n\t" + "LDR r5, [%[b], #208]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #204]\n\t" + "LDR r5, [%[b], #204]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #200]\n\t" + "LDR r5, [%[b], #200]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #196]\n\t" + "LDR r5, [%[b], #196]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #192]\n\t" + "LDR r5, [%[b], #192]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #188]\n\t" + "LDR r5, [%[b], #188]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #184]\n\t" + "LDR r5, [%[b], #184]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #180]\n\t" + "LDR r5, [%[b], #180]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #176]\n\t" + "LDR r5, [%[b], #176]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #172]\n\t" + "LDR r5, [%[b], #172]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, 
r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #168]\n\t" + "LDR r5, [%[b], #168]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #164]\n\t" + "LDR r5, [%[b], #164]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #160]\n\t" + "LDR r5, [%[b], #160]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #156]\n\t" + "LDR r5, [%[b], #156]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #152]\n\t" + "LDR r5, [%[b], #152]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #148]\n\t" + "LDR r5, [%[b], #148]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #144]\n\t" + "LDR r5, [%[b], #144]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #140]\n\t" + "LDR r5, [%[b], #140]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #136]\n\t" + "LDR r5, [%[b], #136]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #132]\n\t" + "LDR r5, [%[b], #132]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #128]\n\t" + "LDR r5, [%[b], #128]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #124]\n\t" + "LDR r5, [%[b], #124]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #120]\n\t" + "LDR r5, [%[b], #120]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #116]\n\t" + "LDR r5, [%[b], #116]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #112]\n\t" + "LDR r5, [%[b], #112]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + 
"movne r3, r7\n\t" + "LDR r4, [%[a], #108]\n\t" + "LDR r5, [%[b], #108]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #104]\n\t" + "LDR r5, [%[b], #104]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #100]\n\t" + "LDR r5, [%[b], #100]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #96]\n\t" + "LDR r5, [%[b], #96]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #92]\n\t" + "LDR r5, [%[b], #92]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #88]\n\t" + "LDR r5, [%[b], #88]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #84]\n\t" + "LDR r5, [%[b], #84]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #80]\n\t" + "LDR r5, [%[b], #80]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #76]\n\t" + "LDR r5, [%[b], #76]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #72]\n\t" + "LDR r5, [%[b], #72]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #68]\n\t" + "LDR r5, [%[b], #68]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #64]\n\t" + "LDR r5, [%[b], #64]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #60]\n\t" + "LDR r5, [%[b], #60]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #56]\n\t" + "LDR r5, [%[b], #56]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #52]\n\t" + "LDR r5, [%[b], #52]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #48]\n\t" + "LDR r5, [%[b], #48]\n\t" + "AND 
r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #44]\n\t" + "LDR r5, [%[b], #44]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #40]\n\t" + "LDR r5, [%[b], #40]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #36]\n\t" + "LDR r5, [%[b], #36]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #32]\n\t" + "LDR r5, [%[b], #32]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #28]\n\t" + "LDR r5, [%[b], #28]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #24]\n\t" + "LDR r5, [%[b], #24]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #20]\n\t" + "LDR r5, [%[b], #20]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #16]\n\t" + "LDR r5, [%[b], #16]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #12]\n\t" + "LDR r5, [%[b], #12]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #8]\n\t" + "LDR r5, [%[b], #8]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #4]\n\t" + "LDR r5, [%[b], #4]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[b]]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "EOR r2, r2, r3\n\t" +#endif /*WOLFSSL_SP_SMALL */ + "MOV %[a], r2\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); - - return r; + return (uint32_t)(size_t)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -13816,6 +26982,7 @@ static WC_INLINE int sp_4096_mod_128(sp_digit* r, const sp_digit* a, const sp_di return sp_4096_div_128(a, m, NULL, r); } +#endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */ #if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL @@ -14279,6 +27446,7 @@ 
int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, } #ifndef WOLFSSL_RSA_PUBLIC_ONLY +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -14287,40 +27455,292 @@ int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em, * b A single precision number to add. * m Mask value to apply. */ -SP_NOINLINE static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) +static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #1\n\t" - "lsl r5, r5, #8\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adds r5, %[c], #-1\n\t" - "ldr r5, [%[a], r8]\n\t" - "adcs r5, r5, r6\n\t" - "mov %[c], #0\n\t" - "adcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r5, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "\n" + "L_sp_4096_cond_add_64_words_%=:\n\t" + "ADDS r5, r5, #0x-1\n\t" + "LDR r6, [%[a], r4]\n\t" + "LDR r7, [%[b], r4]\n\t" + "AND r7, r7, %[m]\n\t" + "ADCS r6, r6, r7\n\t" + "ADC r5, r8, r8\n\t" + "STR r6, [%[r], r4]\n\t" + "ADD r4, r4, #0x4\n\t" + "CMP r4, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_4096_cond_add_64_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_4096_cond_add_64_words_%=\n\t" +#endif + "MOV %[r], r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
+ */ +static sp_digit sp_4096_cond_add_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADDS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND 
r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "ADC %[r], r10, r10\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* RSA private key operation. * * in Array of bytes representing the number to exponentiate, base. 
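
For reference, both new variants of sp_4096_cond_add_64 above (the WOLFSSL_SP_SMALL loop and the unrolled LDM/STM version) compute the same operation; the unrolled form just processes two limbs per iteration to cut loop overhead. The portable sketch below is illustrative only and is not part of the patch — it assumes the sp_digit (32-bit) and sp_int64 types used elsewhere in the SP code, and the helper name is hypothetical:

/* Illustrative sketch only -- not part of the patch.
 * Reference model of what sp_4096_cond_add_64() computes:
 * r = a + (b & m) over 64 32-bit limbs, where m is either 0 or
 * 0xffffffff, returning the carry out of the most significant limb. */
static sp_digit sp_4096_cond_add_64_ref(sp_digit* r, const sp_digit* a,
    const sp_digit* b, sp_digit m)
{
    sp_digit c = 0;
    int i;

    for (i = 0; i < 64; i++) {
        /* a[i] + masked b[i] + carry fits in 33 bits, so accumulate in 64. */
        sp_int64 t = (sp_int64)a[i] + (b[i] & m) + c;
        r[i] = (sp_digit)t;
        c = (sp_digit)(t >> 32);   /* propagate carry to the next limb */
    }
    return c;
}

As the function comment says, m is -1 to add and 0 not to add; since the mask is applied to every limb and there is no data-dependent branch, the caller can use the 0/-1 result of a preceding compare or subtract to fix up a value in constant time.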
@@ -14635,798 +28055,785 @@ int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod, #ifdef WOLFSSL_HAVE_SP_DH #ifdef HAVE_FFDHE_4096 -static void sp_4096_lshift_128(sp_digit* r, const sp_digit* a, byte n) +static void sp_4096_lshift_128(sp_digit* r_p, const sp_digit* a_p, byte n_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; + __asm__ __volatile__ ( - "mov r6, #31\n\t" - "sub r6, r6, %[n]\n\t" - "add %[a], %[a], #448\n\t" - "add %[r], %[r], #448\n\t" - "ldr r3, [%[a], #60]\n\t" - "lsr r4, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r4, r4, r6\n\t" - "ldr r2, [%[a], #56]\n\t" - "str r4, [%[r], #64]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #52]\n\t" - "str r3, [%[r], #60]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #48]\n\t" - "str r2, [%[r], #56]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #44]\n\t" - "str r4, [%[r], #52]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #40]\n\t" - "str r3, [%[r], #48]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #36]\n\t" - "str r2, [%[r], #44]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #32]\n\t" - "str r4, [%[r], #40]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #28]\n\t" - "str r3, [%[r], #36]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #24]\n\t" - "str r2, [%[r], #32]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #20]\n\t" - "str r4, [%[r], #28]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #16]\n\t" - "str r3, [%[r], #24]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #12]\n\t" - "str r2, [%[r], #20]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #8]\n\t" - "str r4, [%[r], #16]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #4]\n\t" - "str r3, [%[r], #12]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #0]\n\t" - "str r2, [%[r], #8]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r4, [%[r], #68]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r3, [%[r], #64]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r2, [%[r], #60]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r4, [%[r], #56]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr 
r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r3, [%[r], #52]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r2, [%[r], #48]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r4, [%[r], #44]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r3, [%[r], #40]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r2, [%[r], #36]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r4, [%[r], #32]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r3, [%[r], #28]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r2, [%[r], #24]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r4, [%[r], #20]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r3, [%[r], #16]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #4]\n\t" - "str r2, [%[r], #12]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #0]\n\t" - "str r4, [%[r], #8]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r4, [%[a], #60]\n\t" - "str r3, [%[r], #68]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #56]\n\t" - "str r2, [%[r], #64]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #52]\n\t" - "str r4, [%[r], #60]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #48]\n\t" - "str r3, [%[r], #56]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #44]\n\t" - "str r2, [%[r], #52]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #40]\n\t" - "str r4, [%[r], #48]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #36]\n\t" - "str r3, [%[r], #44]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #32]\n\t" - "str r2, [%[r], #40]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #28]\n\t" - "str r4, [%[r], #36]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #24]\n\t" - "str r3, [%[r], #32]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #20]\n\t" - "str r2, [%[r], #28]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, 
[%[a], #16]\n\t" - "str r4, [%[r], #24]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #12]\n\t" - "str r3, [%[r], #20]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #8]\n\t" - "str r2, [%[r], #16]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #4]\n\t" - "str r4, [%[r], #12]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #0]\n\t" - "str r3, [%[r], #8]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r3, [%[a], #60]\n\t" - "str r2, [%[r], #68]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #56]\n\t" - "str r4, [%[r], #64]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #52]\n\t" - "str r3, [%[r], #60]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #48]\n\t" - "str r2, [%[r], #56]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #44]\n\t" - "str r4, [%[r], #52]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #40]\n\t" - "str r3, [%[r], #48]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #36]\n\t" - "str r2, [%[r], #44]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #32]\n\t" - "str r4, [%[r], #40]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #28]\n\t" - "str r3, [%[r], #36]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #24]\n\t" - "str r2, [%[r], #32]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #20]\n\t" - "str r4, [%[r], #28]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #16]\n\t" - "str r3, [%[r], #24]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #12]\n\t" - "str r2, [%[r], #20]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #8]\n\t" - "str r4, [%[r], #16]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #4]\n\t" - "str r3, [%[r], #12]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #0]\n\t" - "str r2, [%[r], #8]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r4, [%[r], #68]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r3, [%[r], #64]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, 
[%[a], #52]\n\t" - "str r2, [%[r], #60]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r4, [%[r], #56]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r3, [%[r], #52]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r2, [%[r], #48]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r4, [%[r], #44]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r3, [%[r], #40]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r2, [%[r], #36]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r4, [%[r], #32]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r3, [%[r], #28]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r2, [%[r], #24]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r4, [%[r], #20]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r3, [%[r], #16]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #4]\n\t" - "str r2, [%[r], #12]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #0]\n\t" - "str r4, [%[r], #8]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r4, [%[a], #60]\n\t" - "str r3, [%[r], #68]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #56]\n\t" - "str r2, [%[r], #64]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #52]\n\t" - "str r4, [%[r], #60]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #48]\n\t" - "str r3, [%[r], #56]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #44]\n\t" - "str r2, [%[r], #52]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #40]\n\t" - "str r4, [%[r], #48]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #36]\n\t" - "str r3, [%[r], #44]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #32]\n\t" - "str r2, [%[r], #40]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #28]\n\t" - "str r4, [%[r], #36]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #24]\n\t" - "str r3, [%[r], #32]\n\t" - "lsr 
r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #20]\n\t" - "str r2, [%[r], #28]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #16]\n\t" - "str r4, [%[r], #24]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #12]\n\t" - "str r3, [%[r], #20]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #8]\n\t" - "str r2, [%[r], #16]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #4]\n\t" - "str r4, [%[r], #12]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #0]\n\t" - "str r3, [%[r], #8]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r3, [%[a], #60]\n\t" - "str r2, [%[r], #68]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #56]\n\t" - "str r4, [%[r], #64]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #52]\n\t" - "str r3, [%[r], #60]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #48]\n\t" - "str r2, [%[r], #56]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #44]\n\t" - "str r4, [%[r], #52]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #40]\n\t" - "str r3, [%[r], #48]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #36]\n\t" - "str r2, [%[r], #44]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #32]\n\t" - "str r4, [%[r], #40]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #28]\n\t" - "str r3, [%[r], #36]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #24]\n\t" - "str r2, [%[r], #32]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #20]\n\t" - "str r4, [%[r], #28]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #16]\n\t" - "str r3, [%[r], #24]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #12]\n\t" - "str r2, [%[r], #20]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #8]\n\t" - "str r4, [%[r], #16]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #4]\n\t" - "str r3, [%[r], #12]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #0]\n\t" - "str r2, [%[r], #8]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "sub %[a], %[a], #64\n\t" - "sub %[r], %[r], #64\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r4, [%[r], #68]\n\t" - "lsr 
r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r3, [%[r], #64]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r2, [%[r], #60]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r4, [%[r], #56]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r3, [%[r], #52]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r2, [%[r], #48]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r4, [%[r], #44]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r3, [%[r], #40]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r2, [%[r], #36]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r4, [%[r], #32]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r3, [%[r], #28]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r2, [%[r], #24]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r4, [%[r], #20]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r3, [%[r], #16]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #4]\n\t" - "str r2, [%[r], #12]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #0]\n\t" - "str r4, [%[r], #8]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "str r2, [%[r]]\n\t" - "str r3, [%[r], #4]\n\t" + "RSB r7, %[n], #0x1f\n\t" + "LDR r5, [%[a], #508]\n\t" + "LSR r6, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r6, r6, r7\n\t" + "LDR r4, [%[a], #504]\n\t" + "STR r6, [%[r], #512]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #500]\n\t" + "STR r5, [%[r], #508]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #496]\n\t" + "STR r4, [%[r], #504]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #492]\n\t" + "STR r6, [%[r], #500]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #488]\n\t" + "STR r5, [%[r], #496]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #484]\n\t" + "STR r4, [%[r], #492]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #480]\n\t" + "STR r6, [%[r], #488]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + 
"ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #476]\n\t" + "STR r5, [%[r], #484]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #472]\n\t" + "STR r4, [%[r], #480]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #468]\n\t" + "STR r6, [%[r], #476]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #464]\n\t" + "STR r5, [%[r], #472]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #460]\n\t" + "STR r4, [%[r], #468]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #456]\n\t" + "STR r6, [%[r], #464]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #452]\n\t" + "STR r5, [%[r], #460]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #448]\n\t" + "STR r4, [%[r], #456]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #444]\n\t" + "STR r6, [%[r], #452]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #440]\n\t" + "STR r5, [%[r], #448]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #436]\n\t" + "STR r4, [%[r], #444]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #432]\n\t" + "STR r6, [%[r], #440]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #428]\n\t" + "STR r5, [%[r], #436]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #424]\n\t" + "STR r4, [%[r], #432]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #420]\n\t" + "STR r6, [%[r], #428]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #416]\n\t" + "STR r5, [%[r], #424]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #412]\n\t" + "STR r4, [%[r], #420]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #408]\n\t" + "STR r6, [%[r], #416]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #404]\n\t" + "STR r5, [%[r], #412]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #400]\n\t" + "STR r4, [%[r], #408]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #396]\n\t" + "STR r6, [%[r], #404]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #392]\n\t" + "STR r5, [%[r], #400]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #388]\n\t" + "STR r4, [%[r], #396]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #384]\n\t" + "STR 
r6, [%[r], #392]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #380]\n\t" + "STR r5, [%[r], #388]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #376]\n\t" + "STR r4, [%[r], #384]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #372]\n\t" + "STR r6, [%[r], #380]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #368]\n\t" + "STR r5, [%[r], #376]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #364]\n\t" + "STR r4, [%[r], #372]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #360]\n\t" + "STR r6, [%[r], #368]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #356]\n\t" + "STR r5, [%[r], #364]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #352]\n\t" + "STR r4, [%[r], #360]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #348]\n\t" + "STR r6, [%[r], #356]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #344]\n\t" + "STR r5, [%[r], #352]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #340]\n\t" + "STR r4, [%[r], #348]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #336]\n\t" + "STR r6, [%[r], #344]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #332]\n\t" + "STR r5, [%[r], #340]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #328]\n\t" + "STR r4, [%[r], #336]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #324]\n\t" + "STR r6, [%[r], #332]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #320]\n\t" + "STR r5, [%[r], #328]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #316]\n\t" + "STR r4, [%[r], #324]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #312]\n\t" + "STR r6, [%[r], #320]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #308]\n\t" + "STR r5, [%[r], #316]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #304]\n\t" + "STR r4, [%[r], #312]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #300]\n\t" + "STR r6, [%[r], #308]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #296]\n\t" + "STR r5, [%[r], #304]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #292]\n\t" + "STR r4, [%[r], #300]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, 
r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #288]\n\t" + "STR r6, [%[r], #296]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #284]\n\t" + "STR r5, [%[r], #292]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #280]\n\t" + "STR r4, [%[r], #288]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #276]\n\t" + "STR r6, [%[r], #284]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #272]\n\t" + "STR r5, [%[r], #280]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #268]\n\t" + "STR r4, [%[r], #276]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #264]\n\t" + "STR r6, [%[r], #272]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #260]\n\t" + "STR r5, [%[r], #268]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #256]\n\t" + "STR r4, [%[r], #264]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #252]\n\t" + "STR r6, [%[r], #260]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #248]\n\t" + "STR r5, [%[r], #256]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #244]\n\t" + "STR r4, [%[r], #252]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #240]\n\t" + "STR r6, [%[r], #248]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #236]\n\t" + "STR r5, [%[r], #244]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #232]\n\t" + "STR r4, [%[r], #240]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #228]\n\t" + "STR r6, [%[r], #236]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #224]\n\t" + "STR r5, [%[r], #232]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #220]\n\t" + "STR r4, [%[r], #228]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #216]\n\t" + "STR r6, [%[r], #224]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #212]\n\t" + "STR r5, [%[r], #220]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #208]\n\t" + "STR r4, [%[r], #216]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #204]\n\t" + "STR r6, [%[r], #212]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #200]\n\t" + "STR r5, [%[r], #208]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, 
r3\n\t" + "LDR r5, [%[a], #196]\n\t" + "STR r4, [%[r], #204]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #192]\n\t" + "STR r6, [%[r], #200]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #188]\n\t" + "STR r5, [%[r], #196]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #184]\n\t" + "STR r4, [%[r], #192]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #180]\n\t" + "STR r6, [%[r], #188]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #176]\n\t" + "STR r5, [%[r], #184]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #172]\n\t" + "STR r4, [%[r], #180]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #168]\n\t" + "STR r6, [%[r], #176]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #164]\n\t" + "STR r5, [%[r], #172]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #160]\n\t" + "STR r4, [%[r], #168]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #156]\n\t" + "STR r6, [%[r], #164]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #152]\n\t" + "STR r5, [%[r], #160]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #148]\n\t" + "STR r4, [%[r], #156]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #144]\n\t" + "STR r6, [%[r], #152]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #140]\n\t" + "STR r5, [%[r], #148]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #136]\n\t" + "STR r4, [%[r], #144]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #132]\n\t" + "STR r6, [%[r], #140]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #128]\n\t" + "STR r5, [%[r], #136]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #124]\n\t" + "STR r4, [%[r], #132]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #120]\n\t" + "STR r6, [%[r], #128]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #116]\n\t" + "STR r5, [%[r], #124]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #112]\n\t" + "STR r4, [%[r], #120]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #108]\n\t" + "STR r6, [%[r], #116]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #104]\n\t" + "STR r5, [%[r], 
#112]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #100]\n\t" + "STR r4, [%[r], #108]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #96]\n\t" + "STR r6, [%[r], #104]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #92]\n\t" + "STR r5, [%[r], #100]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #88]\n\t" + "STR r4, [%[r], #96]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #84]\n\t" + "STR r6, [%[r], #92]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #80]\n\t" + "STR r5, [%[r], #88]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #76]\n\t" + "STR r4, [%[r], #84]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #72]\n\t" + "STR r6, [%[r], #80]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #68]\n\t" + "STR r5, [%[r], #76]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #64]\n\t" + "STR r4, [%[r], #72]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #60]\n\t" + "STR r6, [%[r], #68]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #56]\n\t" + "STR r5, [%[r], #64]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #52]\n\t" + "STR r4, [%[r], #60]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #48]\n\t" + "STR r6, [%[r], #56]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #44]\n\t" + "STR r5, [%[r], #52]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #40]\n\t" + "STR r4, [%[r], #48]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #36]\n\t" + "STR r6, [%[r], #44]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #32]\n\t" + "STR r5, [%[r], #40]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #28]\n\t" + "STR r4, [%[r], #36]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #24]\n\t" + "STR r6, [%[r], #32]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #20]\n\t" + "STR r5, [%[r], #28]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #16]\n\t" + "STR r4, [%[r], #24]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #12]\n\t" + "STR r6, [%[r], #20]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, 
r3\n\t" + "LDR r6, [%[a], #8]\n\t" + "STR r5, [%[r], #16]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #4]\n\t" + "STR r4, [%[r], #12]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a]]\n\t" + "STR r6, [%[r], #8]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "STR r4, [%[r]]\n\t" + "STR r5, [%[r], #4]\n\t" + : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n) : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) - : "memory", "r2", "r3", "r4", "r5", "r6" + : "memory", "r4", "r5", "r6", "r3", "r7" ); } @@ -15712,866 +29119,1022 @@ static const sp_digit p256_b[8] = { }; #endif +#ifdef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[8]; - sp_digit* tmp = tmp_arr; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - /* A[0] * B[0] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r3, r4, r6, r8\n\t" - "mov r5, #0\n\t" - "str r3, [%[tmp], #0]\n\t" - "mov r3, #0\n\t" - /* A[0] * B[1] */ - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r8\n\t" - /* A[1] * B[0] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #4]\n\t" - "mov r4, #0\n\t" - /* A[0] * B[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * B[1] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * B[0] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[tmp], #8]\n\t" - "mov r5, #0\n\t" - /* A[0] * B[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * B[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * B[1] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[0] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[tmp], #12]\n\t" - "mov r3, #0\n\t" - /* A[0] * B[4] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[1] * B[3] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[2] * B[2] */ - 
"ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[3] * B[1] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[4] * B[0] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #16]\n\t" - "mov r4, #0\n\t" - /* A[0] * B[5] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * B[4] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * B[3] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * B[2] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[4] * B[1] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * B[0] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[tmp], #20]\n\t" - "mov r5, #0\n\t" - /* A[0] * B[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[1] * B[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[2] * B[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[3] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * B[2] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[1] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[0] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[tmp], #24]\n\t" - "mov r3, #0\n\t" - /* A[0] * B[7] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[1] * B[6] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[2] * B[5] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[3] * B[4] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" 
- "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[4] * B[3] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[5] * B[2] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[6] * B[1] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[7] * B[0] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #28]\n\t" - "mov r4, #0\n\t" - /* A[1] * B[7] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[2] * B[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[3] * B[5] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[4] * B[4] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * B[3] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[6] * B[2] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[7] * B[1] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[r], #32]\n\t" - "mov r5, #0\n\t" - /* A[2] * B[7] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[3] * B[6] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[4] * B[5] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[5] * B[4] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[3] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[7] * B[2] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[r], #36]\n\t" - "mov r3, #0\n\t" - /* A[3] * B[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[4] * B[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* 
A[5] * B[5] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[6] * B[4] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[7] * B[3] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[r], #40]\n\t" - "mov r4, #0\n\t" - /* A[4] * B[7] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * B[6] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[6] * B[5] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[7] * B[4] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[r], #44]\n\t" - "mov r5, #0\n\t" - /* A[5] * B[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * B[6] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[7] * B[5] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[r], #48]\n\t" - "mov r3, #0\n\t" - /* A[6] * B[7] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - /* A[7] * B[6] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[r], #52]\n\t" - "mov r4, #0\n\t" - /* A[7] * B[7] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r8\n\t" - "str r5, [%[r], #56]\n\t" - "str r3, [%[r], #60]\n\t" - /* Transfer tmp to r */ - "ldr r3, [%[tmp], #0]\n\t" - "ldr r4, [%[tmp], #4]\n\t" - "ldr r5, [%[tmp], #8]\n\t" - "ldr r6, [%[tmp], #12]\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "ldr r3, [%[tmp], #16]\n\t" - "ldr r4, [%[tmp], #20]\n\t" - "ldr r5, [%[tmp], #24]\n\t" - "ldr r6, [%[tmp], #28]\n\t" - "str r3, [%[r], #16]\n\t" - "str r4, [%[r], #20]\n\t" - "str r5, [%[r], #24]\n\t" - "str r6, [%[r], #28]\n\t" + "SUB sp, sp, #0x40\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "\n" + "L_sp_256_mul_8_outer_%=:\n\t" + "SUBS r3, r5, #0x1c\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_256_mul_8_inner_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x20\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_mul_8_inner_done_%=\n\t" +#else + "BEQ.N L_sp_256_mul_8_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_256_mul_8_inner_%=\n\t" +#else + "BLE.N L_sp_256_mul_8_inner_%=\n\t" +#endif + "\n" + "L_sp_256_mul_8_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x38\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_256_mul_8_outer_%=\n\t" +#else + "BLE.N L_sp_256_mul_8_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_256_mul_8_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_256_mul_8_store_%=\n\t" +#else + "BGT.N L_sp_256_mul_8_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp) - : "memory", "r3", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } +#else +#ifdef WOLFSSL_SP_NO_UMAAL +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x24\n\t" + "STR %[r], [sp, #32]\n\t" + "MOV %[r], #0x0\n\t" + "LDR r12, [%[a]]\n\t" + /* A[0] * B[0] */ + "LDR lr, [%[b]]\n\t" + "UMULL r3, r4, r12, lr\n\t" + /* A[0] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "UMULL r5, r6, r12, lr\n\t" + /* A[0] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "UMULL r7, r8, r12, lr\n\t" + /* A[0] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "UMULL r9, r10, r12, lr\n\t" + "STR r3, [sp]\n\t" + /* A[0] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "MOV r11, %[r]\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[0] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[0] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[0] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADC r3, %[r], #0x0\n\t" + "UMLAL r10, r3, r12, lr\n\t" + /* A[1] * B[0] */ + "LDR r12, [%[a], #4]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "STR r4, [sp, #4]\n\t" + "ADDS r5, r5, r11\n\t" + /* A[1] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[1] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[1] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[1] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[1] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[1] * B[6] */ + 
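The looped WOLFSSL_SP_SMALL sp_256_mul_8 above computes the 512-bit product column by column (product scanning) instead of the fully unrolled schoolbook code it removes: r5 holds the byte offset of the output column, r3/r4 walk the a[i]*b[k-i] pairs, and r6:r7:r8 form a three-word column accumulator. A rough portable C equivalent of the same idea, assuming 32-bit sp_digit (the helper name is illustrative, not part of the patch):

    #include <stdint.h>

    /* Column-wise (product-scanning) 8x8 word multiply: r = a * b.
     * Rough portable equivalent of the looped sp_256_mul_8 above. */
    static void mul_8_sketch(uint32_t r[16], const uint32_t a[8], const uint32_t b[8])
    {
        uint64_t lo = 0;    /* low two words of the column accumulator (r6:r7) */
        uint32_t hi = 0;    /* third accumulator word (r8)                     */
        int i, k;

        for (k = 0; k < 15; k++) {              /* one output word per pass      */
            i = (k < 8) ? 0 : (k - 7);          /* clamp, like SUBS r3, r5, #0x1c */
            for (; (i <= k) && (i < 8); i++) {
                uint64_t p = (uint64_t)a[i] * b[k - i];
                uint64_t t = lo + p;
                if (t < lo)
                    hi++;                       /* carry into the third word     */
                lo = t;
            }
            r[k] = (uint32_t)lo;                /* STR r6, [sp, r5]              */
            lo = (lo >> 32) | ((uint64_t)hi << 32);
            hi = 0;
        }
        r[15] = (uint32_t)lo;                   /* top word left in the accumulator */
    }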
"LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[1] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r4, %[r], #0x0\n\t" + "UMLAL r3, r4, r12, lr\n\t" + /* A[2] * B[0] */ + "LDR r12, [%[a], #8]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "STR r5, [sp, #8]\n\t" + "ADDS r6, r6, r11\n\t" + /* A[2] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[2] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[2] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[2] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[2] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[2] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[2] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r5, %[r], #0x0\n\t" + "UMLAL r4, r5, r12, lr\n\t" + /* A[3] * B[0] */ + "LDR r12, [%[a], #12]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "STR r6, [sp, #12]\n\t" + "ADDS r7, r7, r11\n\t" + /* A[3] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[3] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[3] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[3] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[3] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[3] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[3] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r6, %[r], #0x0\n\t" + "UMLAL r5, r6, r12, lr\n\t" + /* A[4] * B[0] */ + "LDR r12, [%[a], #16]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "STR r7, [sp, #16]\n\t" + "ADDS r8, r8, r11\n\t" + /* A[4] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[4] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[4] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[4] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[4] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[4] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[4] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r7, %[r], #0x0\n\t" + "UMLAL r6, r7, r12, lr\n\t" + /* A[5] * B[0] */ + "LDR r12, [%[a], #20]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r8, r11, r12, 
lr\n\t" + "STR r8, [sp, #20]\n\t" + "ADDS r9, r9, r11\n\t" + /* A[5] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[5] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[5] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[5] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[5] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[5] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[5] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r8, %[r], #0x0\n\t" + "UMLAL r7, r8, r12, lr\n\t" + /* A[6] * B[0] */ + "LDR r12, [%[a], #24]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "STR r9, [sp, #24]\n\t" + "ADDS r10, r10, r11\n\t" + /* A[6] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[6] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[6] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[6] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[6] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[6] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[6] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r9, %[r], #0x0\n\t" + "UMLAL r8, r9, r12, lr\n\t" + /* A[7] * B[0] */ + "LDR r12, [%[a], #28]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "STR r10, [sp, #28]\n\t" + "ADDS r3, r3, r11\n\t" + /* A[7] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[7] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[7] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[7] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[7] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[7] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[7] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r10, %[r], #0x0\n\t" + "UMLAL r9, r10, r12, lr\n\t" + "LDR %[r], [sp, #32]\n\t" + "ADD %[r], %[r], #0x20\n\t" + "STM %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "SUB %[r], %[r], #0x20\n\t" + "STM %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADD sp, sp, #0x24\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#else +/* Multiply a and b into r. 
(r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x2c\n\t" + "STRD %[r], %[a], [sp, #36]\n\t" + "MOV lr, %[b]\n\t" + "LDM %[a], {%[r], %[a], %[b], r3}\n\t" + "LDM lr!, {r4, r5, r6}\n\t" + "UMULL r10, r11, %[r], r4\n\t" + "UMULL r12, r7, %[a], r4\n\t" + "UMAAL r11, r12, %[r], r5\n\t" + "UMULL r8, r9, %[b], r4\n\t" + "UMAAL r12, r8, %[a], r5\n\t" + "UMAAL r12, r7, %[r], r6\n\t" + "UMAAL r8, r9, r3, r4\n\t" + "STM sp, {r10, r11, r12}\n\t" + "UMAAL r7, r8, %[b], r5\n\t" + "LDM lr!, {r4}\n\t" + "UMULL r10, r11, %[a], r6\n\t" + "UMAAL r8, r9, %[b], r6\n\t" + "UMAAL r7, r10, %[r], r4\n\t" + "UMAAL r8, r11, r3, r5\n\t" + "STR r7, [sp, #12]\n\t" + "UMAAL r8, r10, %[a], r4\n\t" + "UMAAL r9, r11, r3, r6\n\t" + "UMAAL r9, r10, %[b], r4\n\t" + "UMAAL r10, r11, r3, r4\n\t" + "LDM lr, {r4, r5, r6, r7}\n\t" + "MOV r12, #0x0\n\t" + "UMLAL r8, r12, %[r], r4\n\t" + "UMAAL r9, r12, %[a], r4\n\t" + "UMAAL r10, r12, %[b], r4\n\t" + "UMAAL r11, r12, r3, r4\n\t" + "MOV r4, #0x0\n\t" + "UMLAL r9, r4, %[r], r5\n\t" + "UMAAL r10, r4, %[a], r5\n\t" + "UMAAL r11, r4, %[b], r5\n\t" + "UMAAL r12, r4, r3, r5\n\t" + "MOV r5, #0x0\n\t" + "UMLAL r10, r5, %[r], r6\n\t" + "UMAAL r11, r5, %[a], r6\n\t" + "UMAAL r12, r5, %[b], r6\n\t" + "UMAAL r4, r5, r3, r6\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r11, r6, %[r], r7\n\t" + "LDR %[r], [sp, #40]\n\t" + "UMAAL r12, r6, %[a], r7\n\t" + "ADD %[r], %[r], #0x10\n\t" + "UMAAL r4, r6, %[b], r7\n\t" + "SUB lr, lr, #0x10\n\t" + "UMAAL r5, r6, r3, r7\n\t" + "LDM %[r], {%[r], %[a], %[b], r3}\n\t" + "STR r6, [sp, #32]\n\t" + "LDM lr!, {r6}\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r8, r7, %[r], r6\n\t" + "UMAAL r9, r7, %[a], r6\n\t" + "STR r8, [sp, #16]\n\t" + "UMAAL r10, r7, %[b], r6\n\t" + "UMAAL r11, r7, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r8, #0x0\n\t" + "UMLAL r9, r8, %[r], r6\n\t" + "UMAAL r10, r8, %[a], r6\n\t" + "STR r9, [sp, #20]\n\t" + "UMAAL r11, r8, %[b], r6\n\t" + "UMAAL r12, r8, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r9, #0x0\n\t" + "UMLAL r10, r9, %[r], r6\n\t" + "UMAAL r11, r9, %[a], r6\n\t" + "STR r10, [sp, #24]\n\t" + "UMAAL r12, r9, %[b], r6\n\t" + "UMAAL r4, r9, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r10, #0x0\n\t" + "UMLAL r11, r10, %[r], r6\n\t" + "UMAAL r12, r10, %[a], r6\n\t" + "STR r11, [sp, #28]\n\t" + "UMAAL r4, r10, %[b], r6\n\t" + "UMAAL r5, r10, r3, r6\n\t" + "LDM lr!, {r11}\n\t" + "UMAAL r12, r7, %[r], r11\n\t" + "UMAAL r4, r7, %[a], r11\n\t" + "LDR r6, [sp, #32]\n\t" + "UMAAL r5, r7, %[b], r11\n\t" + "UMAAL r6, r7, r3, r11\n\t" + "LDM lr!, {r11}\n\t" + "UMAAL r4, r8, %[r], r11\n\t" + "UMAAL r5, r8, %[a], r11\n\t" + "UMAAL r6, r8, %[b], r11\n\t" + "UMAAL r7, r8, r3, r11\n\t" + "LDM lr, {r11, lr}\n\t" + "UMAAL r5, r9, %[r], r11\n\t" + "UMAAL r6, r10, %[r], lr\n\t" + "UMAAL r6, r9, %[a], r11\n\t" + "UMAAL r7, r10, %[a], lr\n\t" + "UMAAL r7, r9, %[b], r11\n\t" + "UMAAL r8, r10, %[b], lr\n\t" + "UMAAL r8, r9, r3, r11\n\t" + "UMAAL r9, r10, r3, lr\n\t" + "MOV r3, r12\n\t" + "LDR lr, [sp, #36]\n\t" + "ADD lr, lr, #0x20\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "SUB lr, lr, #0x20\n\t" + "LDM sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, 
r10}\n\t" + "ADD sp, sp, #0x2c\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr" + ); +} + +#endif /* WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL /* Square a and put result in r. (r = a * a) * * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) +static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) { - sp_digit tmp_arr[8]; - sp_digit* tmp = tmp_arr; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - /* A[0] * A[0] */ - "ldr r6, [%[a], #0]\n\t" - "umull r3, r4, r6, r6\n\t" - "mov r5, #0\n\t" - "str r3, [%[tmp], #0]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[1] */ - "ldr r8, [%[a], #4]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[tmp], #4]\n\t" - "mov r4, #0\n\t" - /* A[0] * A[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[1] * A[1] */ - "ldr r6, [%[a], #4]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[tmp], #8]\n\t" - "mov r5, #0\n\t" - /* A[0] * A[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[tmp], #12]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[4] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[3] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[2] */ - "ldr r6, [%[a], #8]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[tmp], #16]\n\t" - "mov r4, #0\n\t" - /* A[0] * A[5] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[4] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[3] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r5, r5, r9\n\t" - "adcs r3, r3, r10\n\t" - "adc r4, r4, r11\n\t" - "str r5, [%[tmp], #20]\n\t" - "mov r5, #0\n\t" - /* A[0] * A[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, 
r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[3] */ - "ldr r6, [%[a], #12]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[tmp], #24]\n\t" - "mov r3, #0\n\t" - /* A[0] * A[7] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[1] * A[6] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[2] * A[5] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[4] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[tmp], #28]\n\t" - "mov r4, #0\n\t" - /* A[1] * A[7] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[2] * A[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[3] * A[5] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[4] * A[4] */ - "ldr r6, [%[a], #16]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r5, r5, r9\n\t" - "adcs r3, r3, r10\n\t" - "adc r4, r4, r11\n\t" - "str r5, [%[r], #32]\n\t" - "mov r5, #0\n\t" - /* A[2] * A[7] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[3] * A[6] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[4] * A[5] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adc r5, r5, r11\n\t" - "str r3, [%[r], #36]\n\t" - "mov r3, #0\n\t" - /* A[3] * A[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r9, r10, r6, r8\n\t" - "mov r11, #0\n\t" - /* A[4] * A[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r9, r9, r6\n\t" - "adcs r10, r10, r8\n\t" - "adc r11, r11, #0\n\t" - /* A[5] * A[5] */ - "ldr r6, [%[a], #20]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adc r11, r11, r11\n\t" - "adds r4, r4, r9\n\t" - "adcs 
r5, r5, r10\n\t" - "adc r3, r3, r11\n\t" - "str r4, [%[r], #40]\n\t" - "mov r4, #0\n\t" - /* A[4] * A[7] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - /* A[5] * A[6] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adcs r3, r3, r8\n\t" - "adc r4, r4, #0\n\t" - "str r5, [%[r], #44]\n\t" - "mov r5, #0\n\t" - /* A[5] * A[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[6] * A[6] */ - "ldr r6, [%[a], #24]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - "str r3, [%[r], #48]\n\t" - "mov r3, #0\n\t" - /* A[6] * A[7] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc r3, r3, #0\n\t" - "str r4, [%[r], #52]\n\t" - "mov r4, #0\n\t" - /* A[7] * A[7] */ - "ldr r6, [%[a], #28]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r5, r5, r6\n\t" - "adc r3, r3, r8\n\t" - "str r5, [%[r], #56]\n\t" - "str r3, [%[r], #60]\n\t" - /* Transfer tmp to r */ - "ldr r3, [%[tmp], #0]\n\t" - "ldr r4, [%[tmp], #4]\n\t" - "ldr r5, [%[tmp], #8]\n\t" - "ldr r6, [%[tmp], #12]\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "ldr r3, [%[tmp], #16]\n\t" - "ldr r4, [%[tmp], #20]\n\t" - "ldr r5, [%[tmp], #24]\n\t" - "ldr r6, [%[tmp], #28]\n\t" - "str r3, [%[r], #16]\n\t" - "str r4, [%[r], #20]\n\t" - "str r5, [%[r], #24]\n\t" - "str r6, [%[r], #28]\n\t" + "SUB sp, sp, #0x40\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_256_sqr_8_outer_%=:\n\t" + "SUBS r3, r5, #0x1c\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_256_sqr_8_inner_%=:\n\t" + "CMP r4, r3\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_sqr_8_op_sqr_%=\n\t" +#else + "BEQ.N L_sp_256_sqr_8_op_sqr_%=\n\t" +#endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[a], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "bal L_sp_256_sqr_8_op_done_%=\n\t" + "\n" + "L_sp_256_sqr_8_op_sqr_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "\n" + "L_sp_256_sqr_8_op_done_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x20\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_sqr_8_inner_done_%=\n\t" +#else + "BEQ.N L_sp_256_sqr_8_inner_done_%=\n\t" +#endif + "CMP r3, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_256_sqr_8_inner_done_%=\n\t" +#else + "BGT.N L_sp_256_sqr_8_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE 
L_sp_256_sqr_8_inner_%=\n\t" +#else + "BLE.N L_sp_256_sqr_8_inner_%=\n\t" +#endif + "\n" + "L_sp_256_sqr_8_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x38\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_256_sqr_8_outer_%=\n\t" +#else + "BLE.N L_sp_256_sqr_8_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_256_sqr_8_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_256_sqr_8_store_%=\n\t" +#else + "BGT.N L_sp_256_sqr_8_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } +#else +#ifdef WOLFSSL_SP_NO_UMAAL +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x44\n\t" + "STR %[r], [sp, #64]\n\t" + "MOV %[r], #0x0\n\t" + "LDR r12, [%[a]]\n\t" + /* A[0] * A[1] */ + "LDR lr, [%[a], #4]\n\t" + "UMULL r4, r5, r12, lr\n\t" + /* A[0] * A[3] */ + "LDR lr, [%[a], #12]\n\t" + "UMULL r6, r7, r12, lr\n\t" + /* A[0] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "UMULL r8, r9, r12, lr\n\t" + /* A[0] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "UMULL r10, r3, r12, lr\n\t" + /* A[0] * A[2] */ + "LDR lr, [%[a], #8]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[0] * A[4] */ + "LDR lr, [%[a], #16]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[0] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + "ADCS r3, r3, #0x0\n\t" + "STR r4, [sp, #4]\n\t" + "STR r5, [sp, #8]\n\t" + /* A[1] * A[2] */ + "LDR r12, [%[a], #4]\n\t" + "LDR lr, [%[a], #8]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "STR r6, [sp, #12]\n\t" + "ADDS r7, r7, r11\n\t" + /* A[1] * A[3] */ + "LDR lr, [%[a], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "STR r7, [sp, #16]\n\t" + "ADDS r8, r8, r11\n\t" + /* A[1] * A[4] */ + "LDR lr, [%[a], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[1] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[1] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[1] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "ADC r4, %[r], #0x0\n\t" + "UMLAL r3, r4, r12, lr\n\t" + /* A[2] * A[3] */ + "LDR r12, [%[a], #8]\n\t" + "LDR lr, [%[a], #12]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "STR r8, [sp, #20]\n\t" + "ADDS r9, r9, r11\n\t" + /* A[2] * A[4] */ + "LDR lr, [%[a], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "STR r9, [sp, #24]\n\t" + "ADDS r10, r10, r11\n\t" + /* A[2] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, 
lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[2] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[2] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "ADC r5, %[r], #0x0\n\t" + "UMLAL r4, r5, r12, lr\n\t" + /* A[3] * A[4] */ + "LDR r12, [%[a], #12]\n\t" + "LDR lr, [%[a], #16]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "STR r10, [sp, #28]\n\t" + "ADDS r3, r3, r11\n\t" + /* A[3] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[3] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[3] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "ADC r6, %[r], #0x0\n\t" + "UMLAL r5, r6, r12, lr\n\t" + /* A[4] * A[5] */ + "LDR r12, [%[a], #16]\n\t" + "LDR lr, [%[a], #20]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[4] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[4] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "ADC r7, %[r], #0x0\n\t" + "UMLAL r6, r7, r12, lr\n\t" + /* A[5] * A[6] */ + "LDR r12, [%[a], #20]\n\t" + "LDR lr, [%[a], #24]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[5] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "ADC r8, %[r], #0x0\n\t" + "UMLAL r7, r8, r12, lr\n\t" + /* A[6] * A[7] */ + "LDR r12, [%[a], #24]\n\t" + "LDR lr, [%[a], #28]\n\t" + "MOV r9, #0x0\n\t" + "UMLAL r8, r9, r12, lr\n\t" + "ADD lr, sp, #0x20\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADD lr, sp, #0x4\n\t" + "LDM lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "STM lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADCS r3, r3, r3\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADC r10, %[r], #0x0\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADD lr, sp, #0x4\n\t" + "LDM lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "MOV lr, sp\n\t" + /* A[0] * A[0] */ + "LDR r12, [%[a]]\n\t" + "UMULL r3, r11, r12, r12\n\t" + "ADDS r4, r4, r11\n\t" + /* A[1] * A[1] */ + "LDR r12, [%[a], #4]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, r12\n\t" + "ADDS r6, r6, r11\n\t" + /* A[2] * A[2] */ + "LDR r12, [%[a], #8]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, r12\n\t" + "ADDS r8, r8, r11\n\t" + /* A[3] * A[3] */ + "LDR r12, [%[a], #12]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, r12\n\t" + "ADDS r10, r10, r11\n\t" + "STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* A[4] * A[4] */ + "LDR r12, [%[a], #16]\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, r12\n\t" + "ADDS r4, r4, r11\n\t" + /* A[5] * A[5] */ + "LDR r12, [%[a], #20]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, r12\n\t" + "ADDS r6, r6, r11\n\t" + /* A[6] * A[6] */ + "LDR r12, [%[a], #24]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, r12\n\t" + "ADDS r8, r8, r11\n\t" + /* A[7] * A[7] */ + "LDR r12, [%[a], #28]\n\t" 
+ "ADCS r9, r9, #0x0\n\t" + "ADC r10, r10, #0x0\n\t" + "UMLAL r9, r10, r12, r12\n\t" + "LDR %[r], [sp, #64]\n\t" + "ADD %[r], %[r], #0x20\n\t" + "STM %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM sp, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "SUB %[r], %[r], #0x20\n\t" + "STM %[r], {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADD sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#else +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. + */ +static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x20\n\t" + "STR %[r], [sp, #28]\n\t" + "LDM %[a], {%[r], %[a], r2, r3, r4, r5, r6, r7}\n\t" + "UMULL r9, r10, %[r], %[r]\n\t" + "UMULL r11, r12, %[r], %[a]\n\t" + "ADDS r11, r11, r11\n\t" + "MOV lr, #0x0\n\t" + "UMAAL r10, r11, lr, lr\n\t" + "STM sp, {r9, r10}\n\t" + "MOV r8, lr\n\t" + "UMAAL r8, r12, %[r], r2\n\t" + "ADCS r8, r8, r8\n\t" + "UMAAL r8, r11, %[a], %[a]\n\t" + "UMULL r9, r10, %[r], r3\n\t" + "UMAAL r9, r12, %[a], r2\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, lr, lr\n\t" + "STRD r8, r9, [sp, #8]\n\t" + "MOV r9, lr\n\t" + "UMAAL r9, r10, %[r], r4\n\t" + "UMAAL r9, r12, %[a], r3\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, r2, r2\n\t" + "STR r9, [sp, #16]\n\t" + "UMULL r9, r8, %[r], r5\n\t" + "UMAAL r9, r12, %[a], r4\n\t" + "UMAAL r9, r10, r2, r3\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, lr, lr\n\t" + "STR r9, [sp, #20]\n\t" + "MOV r9, lr\n\t" + "UMAAL r9, r8, %[r], r6\n\t" + "UMAAL r9, r12, %[a], r5\n\t" + "UMAAL r9, r10, r2, r4\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, r3, r3\n\t" + "STR r9, [sp, #24]\n\t" + "UMULL %[r], r9, %[r], r7\n\t" + "UMAAL %[r], r8, %[a], r6\n\t" + "UMAAL %[r], r12, r2, r5\n\t" + "UMAAL %[r], r10, r3, r4\n\t" + "ADCS %[r], %[r], %[r]\n\t" + "UMAAL %[r], r11, lr, lr\n\t" + /* R[7] = r0 */ + "UMAAL r9, r8, %[a], r7\n\t" + "UMAAL r9, r10, r2, r6\n\t" + "UMAAL r12, r9, r3, r5\n\t" + "ADCS r12, r12, r12\n\t" + "UMAAL r12, r11, r4, r4\n\t" + /* R[8] = r12 */ + "UMAAL r9, r8, r2, r7\n\t" + "UMAAL r10, r9, r3, r6\n\t" + "MOV r2, lr\n\t" + "UMAAL r10, r2, r4, r5\n\t" + "ADCS r10, r10, r10\n\t" + "UMAAL r11, r10, lr, lr\n\t" + /* R[9] = r11 */ + "UMAAL r2, r8, r3, r7\n\t" + "UMAAL r2, r9, r4, r6\n\t" + "ADCS r3, r2, r2\n\t" + "UMAAL r10, r3, r5, r5\n\t" + /* R[10] = r10 */ + "MOV %[a], lr\n\t" + "UMAAL %[a], r8, r4, r7\n\t" + "UMAAL %[a], r9, r5, r6\n\t" + "ADCS r4, %[a], %[a]\n\t" + "UMAAL r3, r4, lr, lr\n\t" + /* R[11] = r3 */ + "UMAAL r8, r9, r5, r7\n\t" + "ADCS r8, r8, r8\n\t" + "UMAAL r4, r8, r6, r6\n\t" + /* R[12] = r4 */ + "MOV r5, lr\n\t" + "UMAAL r5, r9, r6, r7\n\t" + "ADCS r5, r5, r5\n\t" + "UMAAL r8, r5, lr, lr\n\t" + /* R[13] = r8 */ + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r5, r7, r7\n\t" + "ADCS r7, r5, lr\n\t" + /* R[14] = r9 */ + /* R[15] = r7 */ + "LDR lr, [sp, #28]\n\t" + "ADD lr, lr, #0x1c\n\t" + "STM lr!, {%[r], r12}\n\t" + "STM lr!, {r11}\n\t" + "STM lr!, {r10}\n\t" + "STM lr!, {r3, r4, r8, r9}\n\t" + "STM lr!, {r7}\n\t" + "SUB lr, lr, #0x40\n\t" + "LDM sp, {%[r], %[a], r2, r3, r4, r5, r6}\n\t" + "STM lr, {%[r], %[a], r2, r3, r4, r5, r6}\n\t" + "ADD sp, sp, #0x20\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); +} + +#endif /* 
WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Add b to a into r. (r = a + b) * @@ -16579,39 +30142,39 @@ SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r8, #0\n\t" - "add r6, r6, #32\n\t" - "sub r8, r8, #1\n\t" - "\n1:\n\t" - "adds %[c], %[c], r8\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r3, #0x0\n\t" + "ADD r12, %[a], #0x20\n\t" + "\n" + "L_sp_256_add_8_word_%=:\n\t" + "ADDS r3, r3, #0x-1\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "MOV r4, #0x0\n\t" + "ADC r3, r4, #0x0\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_256_add_8_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_256_add_8_word_%=\n\t" +#endif + "MOV %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #else @@ -16621,40 +30184,34 @@ SP_NOINLINE static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_256_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -16665,37 +30222,38 @@ SP_NOINLINE static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "add r6, r6, #32\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "sbcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r11, #0x0\n\t" + "ADD r12, %[a], #0x20\n\t" + "\n" + "L_sp_256_sub_8_word_%=:\n\t" + "RSBS r11, r11, #0x0\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC r11, r3, r3\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_256_sub_8_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_256_sub_8_word_%=\n\t" +#endif + "MOV %[r], r11\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #else @@ -16705,39 +30263,33 @@ SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
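The sp_256_add_8 and sp_256_sub_8 rewrites in this area (looped WOLFSSL_SP_SMALL forms and the unrolled forms alike) are plain 8-word carry and borrow chains; the new versions move four words at a time with LDM/STM and return the final carry or borrow through r0 rather than a separate C variable. Functionally they amount to the following portable sketch, assuming 32-bit sp_digit (helper names illustrative):

    #include <stdint.h>

    /* r = a + b, returns the carry out (0 or 1). */
    static uint32_t add_8_sketch(uint32_t r[8], const uint32_t a[8], const uint32_t b[8])
    {
        uint32_t carry = 0;
        int i;
        for (i = 0; i < 8; i++) {
            uint64_t t = (uint64_t)a[i] + b[i] + carry;
            r[i] = (uint32_t)t;
            carry = (uint32_t)(t >> 32);
        }
        return carry;
    }

    /* r = a - b, returns 0 on no borrow, 0xffffffff on borrow
     * (mirroring the final "SBC rX, rY, rY"). */
    static uint32_t sub_8_sketch(uint32_t r[8], const uint32_t a[8], const uint32_t b[8])
    {
        uint32_t borrow = 0;                    /* 0 or 1 between words */
        int i;
        for (i = 0; i < 8; i++) {
            uint64_t t = (uint64_t)a[i] - b[i] - borrow;
            r[i] = (uint32_t)t;
            borrow = (uint32_t)(t >> 63);       /* top bit set iff the word wrapped */
        }
        return (uint32_t)0 - borrow;
    }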
*/ -SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_256_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -16747,258 +30299,234 @@ SP_NOINLINE static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a, * a The number to convert. * m The modulus (prime). 
*/ -static int sp_256_mod_mul_norm_8(sp_digit* r, const sp_digit* a, const sp_digit* m) +static int sp_256_mod_mul_norm_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "sub sp, sp, #24\n\t" - "ldr r2, [%[a], #0]\n\t" - "ldr r3, [%[a], #4]\n\t" - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[a], #12]\n\t" - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #20]\n\t" - "ldr r9, [%[a], #24]\n\t" - "ldr r10, [%[a], #28]\n\t" + "SUB sp, sp, #0x18\n\t" + "LDM %[a], {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" /* Clear overflow and underflow */ - "mov r14, #0\n\t" - "mov r12, #0\n\t" - /* t[0] = 1 1 0 -1 -1 -1 -1 0 */ - "adds r11, r2, r3\n\t" - "adc r14, r14, #0\n\t" - "subs r11, r11, r5\n\t" - "sbc r12, r12, #0\n\t" - "subs r11, r11, r6\n\t" - "sbc r12, r12, #0\n\t" - "subs r11, r11, r8\n\t" - "sbc r12, r12, #0\n\t" - "subs r11, r11, r9\n\t" - "sbc r12, r12, #0\n\t" + "MOV r11, #0x0\n\t" + "MOV r12, #0x0\n\t" + "# t[0] = 1 1 0 -1 -1 -1 -1 0\n\t" + "ADDS r10, r2, r3\n\t" + "ADC r11, r11, #0x0\n\t" + "SUBS r10, r10, r5\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r10, r10, r6\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r10, r10, r7\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r10, r10, r8\n\t" + "SBC r12, r12, #0x0\n\t" /* Store t[0] */ - "str r11, [sp, #0]\n\t" - "neg r12, r12\n\t" - "mov r11, #0\n\t" - /* t[1] = 0 1 1 0 -1 -1 -1 -1 */ - "adds r14, r14, r3\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r4\n\t" - "adc r11, r11, #0\n\t" - "subs r14, r14, r12\n\t" - "mov r12, #0\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r6\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r8\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r9\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r10\n\t" - "sbc r12, r12, #0\n\t" + "STR r10, [sp]\n\t" + "neg r12, r12\n\t" + "MOV r10, #0x0\n\t" + "# t[1] = 0 1 1 0 -1 -1 -1 -1\n\t" + "ADDS r11, r11, r3\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r4\n\t" + "ADC r10, r10, #0x0\n\t" + "SUBS r11, r11, r12\n\t" + "SBC r12, r12, r12\n\t" + "SUBS r11, r11, r6\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r11, r11, r7\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r11, r11, r8\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r11, r11, r9\n\t" + "SBC r12, r12, #0x0\n\t" /* Store t[1] */ - "str r14, [sp, #4]\n\t" - "neg r12, r12\n\t" - "mov r14, #0\n\t" - /* t[2] = 0 0 1 1 0 -1 -1 -1 */ - "adds r11, r11, r4\n\t" - "adc r14, r14, #0\n\t" - "adds r11, r11, r5\n\t" - "adc r14, r14, #0\n\t" - "subs r11, r11, r12\n\t" - "mov r12, #0\n\t" - "sbc r12, r12, #0\n\t" - "subs r11, r11, r8\n\t" - "sbc r12, r12, #0\n\t" - "subs r11, r11, r9\n\t" - "sbc r12, r12, #0\n\t" - "subs r11, r11, r10\n\t" - "sbc r12, r12, #0\n\t" + "STR r11, [sp, #4]\n\t" + "neg r12, r12\n\t" + "MOV r11, #0x0\n\t" + "# t[2] = 0 0 1 1 0 -1 -1 -1\n\t" + "ADDS r10, r10, r4\n\t" + "ADC r11, r11, #0x0\n\t" + "ADDS r10, r10, r5\n\t" + "ADC r11, r11, #0x0\n\t" + "SUBS r10, r10, r12\n\t" + "SBC r12, r12, r12\n\t" + "SUBS r10, r10, r7\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r10, r10, r8\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r10, r10, r9\n\t" + "SBC r12, r12, #0x0\n\t" /* Store t[2] */ - "str r11, [sp, #8]\n\t" - "neg r12, r12\n\t" - "mov r11, #0\n\t" - /* t[3] = -1 -1 0 2 2 1 0 -1 */ - "adds r14, r14, r5\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r5\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r6\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r6\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r8\n\t" - 
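The signed coefficient rows in the t[0]..t[7] comments of this function come from reducing a * 2^256 modulo the P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1. Each shifted input word a[j] * 2^(32*j + 256) is rewritten using 2^256 == 2^224 - 2^192 - 2^96 + 1 (mod p), applied repeatedly for the higher words, and collecting the contributions per output word gives the 8x8 matrix of small constants annotated in the code; for example, the first row reads t[0] = a[0] + a[1] - a[3] - a[4] - a[5] - a[6]. The r11/r12 (old code: r14/r12) registers accumulate the positive overflow and negated underflow of each row so the signed sums can be carried with unsigned adds and subtracts.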
"adc r11, r11, #0\n\t" - "subs r14, r14, r12\n\t" - "mov r12, #0\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r2\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r3\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r10\n\t" - "sbc r12, r12, #0\n\t" + "STR r10, [sp, #8]\n\t" + "neg r12, r12\n\t" + "MOV r10, #0x0\n\t" + "# t[3] = -1 -1 0 2 2 1 0 -1\n\t" + "ADDS r11, r11, r5\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r5\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r6\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r6\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r7\n\t" + "ADC r10, r10, #0x0\n\t" + "SUBS r11, r11, r12\n\t" + "SBC r12, r12, r12\n\t" + "SUBS r11, r11, r2\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r11, r11, r3\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r11, r11, r9\n\t" + "SBC r12, r12, #0x0\n\t" /* Store t[3] */ - "str r14, [sp, #12]\n\t" - "neg r12, r12\n\t" - "mov r14, #0\n\t" - /* t[4] = 0 -1 -1 0 2 2 1 0 */ - "adds r11, r11, r6\n\t" - "adc r14, r14, #0\n\t" - "adds r11, r11, r6\n\t" - "adc r14, r14, #0\n\t" - "adds r11, r11, r8\n\t" - "adc r14, r14, #0\n\t" - "adds r11, r11, r8\n\t" - "adc r14, r14, #0\n\t" - "adds r11, r11, r9\n\t" - "adc r14, r14, #0\n\t" - "subs r11, r11, r12\n\t" - "mov r12, #0\n\t" - "sbc r12, r12, #0\n\t" - "subs r11, r11, r3\n\t" - "sbc r12, r12, #0\n\t" - "subs r11, r11, r4\n\t" - "sbc r12, r12, #0\n\t" + "STR r11, [sp, #12]\n\t" + "neg r12, r12\n\t" + "MOV r11, #0x0\n\t" + "# t[4] = 0 -1 -1 0 2 2 1 0\n\t" + "ADDS r10, r10, r6\n\t" + "ADC r11, r11, #0x0\n\t" + "ADDS r10, r10, r6\n\t" + "ADC r11, r11, #0x0\n\t" + "ADDS r10, r10, r7\n\t" + "ADC r11, r11, #0x0\n\t" + "ADDS r10, r10, r7\n\t" + "ADC r11, r11, #0x0\n\t" + "ADDS r10, r10, r8\n\t" + "ADC r11, r11, #0x0\n\t" + "SUBS r10, r10, r12\n\t" + "SBC r12, r12, r12\n\t" + "SUBS r10, r10, r3\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r10, r10, r4\n\t" + "SBC r12, r12, #0x0\n\t" /* Store t[4] */ - "str r11, [sp, #16]\n\t" - "neg r12, r12\n\t" - "mov r11, #0\n\t" - /* t[5] = 0 0 -1 -1 0 2 2 1 */ - "adds r14, r14, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r8\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r9\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r9\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r10\n\t" - "adc r11, r11, #0\n\t" - "subs r14, r14, r12\n\t" - "mov r12, #0\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r4\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r5\n\t" - "sbc r12, r12, #0\n\t" + "STR r10, [sp, #16]\n\t" + "neg r12, r12\n\t" + "MOV r10, #0x0\n\t" + "# t[5] = 0 0 -1 -1 0 2 2 1\n\t" + "ADDS r11, r11, r7\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r7\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r8\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r8\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r9\n\t" + "ADC r10, r10, #0x0\n\t" + "SUBS r11, r11, r12\n\t" + "SBC r12, r12, r12\n\t" + "SUBS r11, r11, r4\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r11, r11, r5\n\t" + "SBC r12, r12, #0x0\n\t" /* Store t[5] */ - "str r14, [sp, #20]\n\t" - "neg r12, r12\n\t" - "mov r14, #0\n\t" - /* t[6] = -1 -1 0 0 0 1 3 2 */ - "adds r11, r11, r8\n\t" - "adc r14, r14, #0\n\t" - "adds r11, r11, r9\n\t" - "adc r14, r14, #0\n\t" - "adds r11, r11, r9\n\t" - "adc r14, r14, #0\n\t" - "adds r11, r11, r9\n\t" - "adc r14, r14, #0\n\t" - "adds r11, r11, r10\n\t" - "adc r14, r14, #0\n\t" - "adds r11, r11, r10\n\t" - "adc r14, r14, #0\n\t" - "subs r11, r11, r12\n\t" - "mov r12, #0\n\t" - "sbc r12, r12, #0\n\t" - "subs r11, r11, r2\n\t" - "sbc r12, r12, #0\n\t" - "subs r11, r11, 
r3\n\t" - "sbc r12, r12, #0\n\t" + "STR r11, [sp, #20]\n\t" + "neg r12, r12\n\t" + "MOV r11, #0x0\n\t" + "# t[6] = -1 -1 0 0 0 1 3 2\n\t" + "ADDS r10, r10, r7\n\t" + "ADC r11, r11, #0x0\n\t" + "ADDS r10, r10, r8\n\t" + "ADC r11, r11, #0x0\n\t" + "ADDS r10, r10, r8\n\t" + "ADC r11, r11, #0x0\n\t" + "ADDS r10, r10, r8\n\t" + "ADC r11, r11, #0x0\n\t" + "ADDS r10, r10, r9\n\t" + "ADC r11, r11, #0x0\n\t" + "ADDS r10, r10, r9\n\t" + "ADC r11, r11, #0x0\n\t" + "SUBS r10, r10, r12\n\t" + "SBC r12, r12, r12\n\t" + "SUBS r10, r10, r2\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r10, r10, r3\n\t" + "SBC r12, r12, #0x0\n\t" /* Store t[6] */ - "mov r9, r11\n\t" - "neg r12, r12\n\t" - "mov r11, #0\n\t" - /* t[7] = 1 0 -1 -1 -1 -1 0 3 */ - "adds r14, r14, r2\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r10\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r10\n\t" - "adc r11, r11, #0\n\t" - "adds r14, r14, r10\n\t" - "adc r11, r11, #0\n\t" - "subs r14, r14, r12\n\t" - "mov r12, #0\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r4\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r5\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r6\n\t" - "sbc r12, r12, #0\n\t" - "subs r14, r14, r8\n\t" - "sbc r12, r12, #0\n\t" + "MOV r8, r10\n\t" + "neg r12, r12\n\t" + "MOV r10, #0x0\n\t" + "# t[7] = 1 0 -1 -1 -1 -1 0 3\n\t" + "ADDS r11, r11, r2\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r9\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r9\n\t" + "ADC r10, r10, #0x0\n\t" + "ADDS r11, r11, r9\n\t" + "ADC r10, r10, #0x0\n\t" + "SUBS r11, r11, r12\n\t" + "SBC r12, r12, r12\n\t" + "SUBS r11, r11, r4\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r11, r11, r5\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r11, r11, r6\n\t" + "SBC r12, r12, #0x0\n\t" + "SUBS r11, r11, r7\n\t" + "SBC r12, r12, #0x0\n\t" /* Store t[7] */ /* Load intermediate */ - "ldr r2, [sp, #0]\n\t" - "ldr r3, [sp, #4]\n\t" - "ldr r4, [sp, #8]\n\t" - "ldr r5, [sp, #12]\n\t" - "ldr r6, [sp, #16]\n\t" - "ldr r8, [sp, #20]\n\t" - "neg r12, r12\n\t" - /* Add overflow */ - /* Subtract underflow - add neg underflow */ - "adds r2, r2, r11\n\t" - "adcs r3, r3, #0\n\t" - "adcs r4, r4, #0\n\t" - "adds r5, r5, r12\n\t" - "adcs r6, r6, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, r12\n\t" - "adcs r14, r14, r11\n\t" - "mov r10, #0\n\t" - "adc r10, r10, #0\n\t" - /* Subtract overflow */ - /* Add underflow - subtract neg underflow */ - "subs r2, r2, r12\n\t" - "sbcs r3, r3, #0\n\t" - "sbcs r4, r4, #0\n\t" - "subs r5, r5, r11\n\t" - "sbcs r6, r6, #0\n\t" - "sbcs r8, r8, #0\n\t" - "sbcs r9, r9, r11\n\t" - "sbcs r14, r14, r12\n\t" - "mov r12, #0\n\t" - "sbc r12, r12, #0\n\t" + "LDM sp, {r2, r3, r4, r5, r6, r7}\n\t" "neg r12, r12\n\t" /* Add overflow */ /* Subtract underflow - add neg underflow */ - "adds r2, r2, r10\n\t" - "adcs r3, r3, #0\n\t" - "adcs r4, r4, #0\n\t" - "adds r5, r5, r12\n\t" - "adcs r6, r6, #0\n\t" - "adcs r8, r8, #0\n\t" - "adcs r9, r9, r12\n\t" - "adc r14, r14, r10\n\t" + "ADDS r2, r2, r10\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, r12\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, r12\n\t" + "ADCS r11, r11, r10\n\t" + "MOV r9, #0x0\n\t" + "ADC r9, r9, #0x0\n\t" /* Subtract overflow */ /* Add underflow - subtract neg underflow */ - "subs r2, r2, r12\n\t" - "sbcs r3, r3, #0\n\t" - "sbcs r4, r4, #0\n\t" - "subs r5, r5, r10\n\t" - "sbcs r6, r6, #0\n\t" - "sbcs r8, r8, #0\n\t" - "sbcs r9, r9, r10\n\t" - "sbc r14, r14, r12\n\t" + "SUBS r2, r2, r12\n\t" + "SBCS r3, r3, #0x0\n\t" + "SBCS r4, r4, #0x0\n\t" + "SBCS 
r5, r5, r10\n\t" + "SBCS r6, r6, #0x0\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, r10\n\t" + "SBCS r11, r11, r12\n\t" + "MOV r12, #0x0\n\t" + "SBC r12, r12, #0x0\n\t" + "neg r12, r12\n\t" + /* Add overflow */ + /* Subtract underflow - add neg underflow */ + "ADDS r2, r2, r9\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, r12\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, r12\n\t" + "ADC r11, r11, r9\n\t" + /* Subtract overflow */ + /* Add underflow - subtract neg underflow */ + "SUBS r2, r2, r12\n\t" + "SBCS r3, r3, #0x0\n\t" + "SBCS r4, r4, #0x0\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, #0x0\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, r9\n\t" + "SBC r11, r11, r12\n\t" /* Store result */ - "str r2, [%[r], #0]\n\t" - "str r3, [%[r], #4]\n\t" - "str r4, [%[r], #8]\n\t" - "str r5, [%[r], #12]\n\t" - "str r6, [%[r], #16]\n\t" - "str r8, [%[r], #20]\n\t" - "str r9, [%[r], #24]\n\t" - "str r14, [%[r], #28]\n\t" - "add sp, sp, #24\n\t" + "STM %[r], {r2, r3, r4, r5, r6, r7, r8, r11}\n\t" + "MOV %[r], #0x0\n\t" + "ADD sp, sp, #0x18\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r14", "r12" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); - - return MP_OKAY; + (void)m_p; + return (uint32_t)(size_t)r; } /* Convert an mp_int to an array of sp_digit. @@ -17011,14 +30539,14 @@ static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -17196,6 +30724,7 @@ static int sp_256_point_to_ecc_point_8(const sp_point_256* p, ecc_point* pm) return err; } +#ifdef WOLFSSL_SP_NO_UMAAL /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -17205,656 +30734,725 @@ static int sp_256_point_to_ecc_point_8(const sp_point_256* p, ecc_point* pm) * m Modulus (prime). * mp Montgomery multiplier. 
*/ -SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m, sp_digit mp) +static void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p, sp_digit mp_p) { - (void)mp; - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "sub sp, sp, #68\n\t" - "mov r5, #0\n\t" - /* A[0] * B[0] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r9, r10, r6, r8\n\t" - "str r9, [sp, #0]\n\t" - /* A[0] * B[1] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adc r11, r4, #0\n\t" - /* A[1] * B[0] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, #0\n\t" - "str r10, [sp, #4]\n\t" - /* A[0] * B[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adc r14, r4, r14\n\t" - /* A[1] * B[1] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, #0\n\t" - /* A[2] * B[0] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - "str r11, [sp, #8]\n\t" - /* A[0] * B[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" - /* A[1] * B[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[2] * B[1] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[3] * B[0] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - "str r14, [sp, #12]\n\t" - /* A[0] * B[4] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, #0\n\t" - /* A[1] * B[3] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[2] * B[2] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[3] * B[1] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[4] * B[0] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - "str r9, [sp, #16]\n\t" - /* A[0] * B[5] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, #0\n\t" - /* A[1] * B[4] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc 
r14, r5, r14\n\t" - /* A[2] * B[3] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - /* A[3] * B[2] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - /* A[4] * B[1] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - /* A[5] * B[0] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - "str r10, [sp, #20]\n\t" - /* A[0] * B[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, #0\n\t" - /* A[1] * B[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - /* A[2] * B[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - /* A[3] * B[3] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - /* A[4] * B[2] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - /* A[5] * B[1] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - /* A[6] * B[0] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - "str r11, [sp, #24]\n\t" - /* A[0] * B[7] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" - /* A[1] * B[6] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[2] * B[5] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[3] * B[4] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[4] * B[3] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[5] * B[2] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[6] * B[1] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[7] * B[0] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #0]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - "str r14, [sp, #28]\n\t" - /* A[1] * B[7] */ - "ldr r6, [%[a], 
#4]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, #0\n\t" - /* A[2] * B[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[3] * B[5] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[4] * B[4] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[5] * B[3] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[6] * B[2] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[7] * B[1] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #4]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - "str r9, [sp, #32]\n\t" - /* A[2] * B[7] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, #0\n\t" - /* A[3] * B[6] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - /* A[4] * B[5] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - /* A[5] * B[4] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - /* A[6] * B[3] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - /* A[7] * B[2] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #8]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - "str r10, [sp, #36]\n\t" - /* A[3] * B[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, #0\n\t" - /* A[4] * B[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - /* A[5] * B[5] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - /* A[6] * B[4] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - /* A[7] * B[3] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - "str r11, [sp, #40]\n\t" - /* A[4] * B[7] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" - /* A[5] * B[6] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r3, 
r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[6] * B[5] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[7] * B[4] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - "str r14, [sp, #44]\n\t" - /* A[5] * B[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, #0\n\t" - /* A[6] * B[6] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[7] * B[5] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[6] * B[7] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, #0\n\t" - /* A[7] * B[6] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - /* A[7] * B[7] */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[b], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adc r14, r4, r14\n\t" - "str r9, [sp, #48]\n\t" - "str r10, [sp, #52]\n\t" - "str r11, [sp, #56]\n\t" - "str r14, [sp, #60]\n\t" + "SUB sp, sp, #0x44\n\t" + "STR %[r], [sp, #64]\n\t" + "MOV %[r], #0x0\n\t" + "LDR r12, [%[a]]\n\t" + /* A[0] * B[0] */ + "LDR lr, [%[b]]\n\t" + "UMULL r3, r4, r12, lr\n\t" + /* A[0] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "UMULL r5, r6, r12, lr\n\t" + /* A[0] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "UMULL r7, r8, r12, lr\n\t" + /* A[0] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "UMULL r9, r10, r12, lr\n\t" + "STR r3, [sp]\n\t" + /* A[0] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "MOV r11, %[r]\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[0] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[0] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[0] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADC r3, %[r], #0x0\n\t" + "UMLAL r10, r3, r12, lr\n\t" + /* A[1] * B[0] */ + "LDR r12, [%[a], #4]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "STR r4, [sp, #4]\n\t" + "ADDS r5, r5, r11\n\t" + /* A[1] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[1] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[1] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[1] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[1] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[1] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], 
#0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[1] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r4, %[r], #0x0\n\t" + "UMLAL r3, r4, r12, lr\n\t" + /* A[2] * B[0] */ + "LDR r12, [%[a], #8]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "STR r5, [sp, #8]\n\t" + "ADDS r6, r6, r11\n\t" + /* A[2] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[2] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[2] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[2] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[2] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[2] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[2] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r5, %[r], #0x0\n\t" + "UMLAL r4, r5, r12, lr\n\t" + /* A[3] * B[0] */ + "LDR r12, [%[a], #12]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "STR r6, [sp, #12]\n\t" + "ADDS r7, r7, r11\n\t" + /* A[3] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[3] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[3] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[3] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[3] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[3] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[3] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r6, %[r], #0x0\n\t" + "UMLAL r5, r6, r12, lr\n\t" + /* A[4] * B[0] */ + "LDR r12, [%[a], #16]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "STR r7, [sp, #16]\n\t" + "ADDS r8, r8, r11\n\t" + /* A[4] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[4] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[4] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[4] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[4] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[4] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[4] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r7, %[r], #0x0\n\t" + "UMLAL r6, r7, r12, lr\n\t" + /* A[5] * B[0] */ + "LDR r12, [%[a], #20]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "STR r8, [sp, #20]\n\t" + "ADDS r9, 
r9, r11\n\t" + /* A[5] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[5] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[5] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[5] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[5] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[5] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[5] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r8, %[r], #0x0\n\t" + "UMLAL r7, r8, r12, lr\n\t" + /* A[6] * B[0] */ + "LDR r12, [%[a], #24]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "STR r9, [sp, #24]\n\t" + "ADDS r10, r10, r11\n\t" + /* A[6] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[6] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[6] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[6] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[6] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[6] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[6] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r9, %[r], #0x0\n\t" + "UMLAL r8, r9, r12, lr\n\t" + /* A[7] * B[0] */ + "LDR r12, [%[a], #28]\n\t" + "LDR lr, [%[b]]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "STR r10, [sp, #28]\n\t" + "ADDS r3, r3, r11\n\t" + /* A[7] * B[1] */ + "LDR lr, [%[b], #4]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[7] * B[2] */ + "LDR lr, [%[b], #8]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[7] * B[3] */ + "LDR lr, [%[b], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[7] * B[4] */ + "LDR lr, [%[b], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[7] * B[5] */ + "LDR lr, [%[b], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[7] * B[6] */ + "LDR lr, [%[b], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[7] * B[7] */ + "LDR lr, [%[b], #28]\n\t" + "ADC r10, %[r], #0x0\n\t" + "UMLAL r9, r10, r12, lr\n\t" + "ADD lr, sp, #0x20\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" /* Start Reduction */ - "ldr r4, [sp, #0]\n\t" - "ldr r5, [sp, #4]\n\t" - "ldr r6, [sp, #8]\n\t" - "ldr r8, [sp, #12]\n\t" - "ldr r9, [sp, #16]\n\t" - "ldr r10, [sp, #20]\n\t" - "ldr r11, [sp, #24]\n\t" - "ldr r14, [sp, #28]\n\t" + "LDM sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "MOV r3, r11\n\t" + "MOV r4, r12\n\t" /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ /* - a[0] << 224 */ /* + 
(a[0]-a[1] * 2) << (6 * 32) */ - "adds r11, r11, r4\n\t" - "adc r14, r14, r5\n\t" - "adds r11, r11, r4\n\t" - "adc r14, r14, r5\n\t" + "ADDS r11, r11, r5\n\t" + "ADC r12, r12, r6\n\t" + "ADDS r11, r11, r5\n\t" + "ADC r12, r12, r6\n\t" /* - a[0] << (7 * 32) */ - "sub r14, r14, r4\n\t" + "SUB r12, r12, r5\n\t" /* + a[0]-a[4] << (3 * 32) */ - "mov %[a], r8\n\t" - "mov %[b], r9\n\t" - "adds r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r11, r11, %[a]\n\t" - "adc r14, r14, %[b]\n\t" - "str r4, [sp, #0]\n\t" - "str r5, [sp, #4]\n\t" - "str r6, [sp, #8]\n\t" - "str r8, [sp, #12]\n\t" - "str r9, [sp, #16]\n\t" - "str r10, [sp, #20]\n\t" + "MOV r0, r8\n\t" + "MOV r1, r9\n\t" + "MOV r2, r10\n\t" + "ADDS r8, r8, r5\n\t" + "ADCS r9, r9, r6\n\t" + "ADCS r10, r10, r7\n\t" + "ADCS r11, r11, r0\n\t" + "ADC r12, r12, r1\n\t" /* a += mu * m */ /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ - "mov %[a], #0\n\t" - /* a[6] += t[0] + t[3] */ - "ldr r3, [sp, #24]\n\t" - "adds r3, r3, r4\n\t" - "adc %[b], %[a], #0\n\t" - "adds r3, r3, r8\n\t" - "adc %[b], %[b], #0\n\t" - "str r11, [sp, #24]\n\t" - /* a[7] += t[1] + t[4] */ - "ldr r3, [sp, #28]\n\t" - "adds r3, r3, %[b]\n\t" - "adc %[b], %[a], #0\n\t" - "adds r3, r3, r5\n\t" - "adc %[b], %[b], #0\n\t" - "adds r3, r3, r9\n\t" - "adc %[b], %[b], #0\n\t" - "str r14, [sp, #28]\n\t" - "str r3, [sp, #64]\n\t" - /* a[8] += t[0] + t[2] + t[5] */ - "ldr r3, [sp, #32]\n\t" - "adds r3, r3, %[b]\n\t" - "adc %[b], %[a], #0\n\t" - "adds r3, r3, r4\n\t" - "adc %[b], %[b], #0\n\t" - "adds r3, r3, r6\n\t" - "adc %[b], %[b], #0\n\t" - "adds r3, r3, r10\n\t" - "adc %[b], %[b], #0\n\t" - "str r3, [sp, #32]\n\t" - /* a[9] += t[1] + t[3] + t[6] */ - /* a[10] += t[2] + t[4] + t[7] */ - "ldr r3, [sp, #36]\n\t" - "ldr r4, [sp, #40]\n\t" - "adds r3, r3, %[b]\n\t" - "adcs r4, r4, #0\n\t" - "adc %[b], %[a], #0\n\t" - "adds r3, r3, r5\n\t" - "adcs r4, r4, r6\n\t" - "adc %[b], %[b], #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc %[b], %[b], #0\n\t" - "adds r3, r3, r11\n\t" - "adcs r4, r4, r14\n\t" - "adc %[b], %[b], #0\n\t" - "str r3, [sp, #36]\n\t" - "str r4, [sp, #40]\n\t" - /* a[11] += t[3] + t[5] */ - /* a[12] += t[4] + t[6] */ - /* a[13] += t[5] + t[7] */ - /* a[14] += t[6] */ - "ldr r3, [sp, #44]\n\t" - "ldr r4, [sp, #48]\n\t" - "ldr r5, [sp, #52]\n\t" - "ldr r6, [sp, #56]\n\t" - "adds r3, r3, %[b]\n\t" - "adcs r4, r4, #0\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adc %[b], %[a], #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adcs r6, r6, r11\n\t" - "adc %[b], %[b], #0\n\t" - "adds r3, r3, r10\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r14\n\t" - "adcs r6, r6, #0\n\t" - "adc %[b], %[b], #0\n\t" - "str r3, [sp, #44]\n\t" - "str r4, [sp, #48]\n\t" - "str r5, [sp, #52]\n\t" - "str r6, [sp, #56]\n\t" - /* a[15] += t[7] */ - "ldr r3, [sp, #60]\n\t" - "adds r3, r3, %[b]\n\t" - "adc %[b], %[a], #0\n\t" - "adds r3, r3, r14\n\t" - "adc %[b], %[b], #0\n\t" - "str r3, [sp, #60]\n\t" - "ldr r3, [sp, #64]\n\t" - "ldr r4, [sp, #32]\n\t" - "ldr r5, [sp, #36]\n\t" - "ldr r6, [sp, #40]\n\t" - "ldr r9, [sp, #0]\n\t" - "ldr r10, [sp, #4]\n\t" - "ldr r11, [sp, #8]\n\t" - "ldr r14, [sp, #12]\n\t" - "subs r3, r3, r9\n\t" - "sbcs r4, r4, r10\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r14\n\t" - "str r4, [sp, #32]\n\t" - "str r5, [sp, #36]\n\t" - "str r6, [sp, #40]\n\t" - "ldr r3, [sp, #44]\n\t" - "ldr r4, [sp, #48]\n\t" - "ldr r5, [sp, #52]\n\t" - "ldr r6, [sp, #56]\n\t" - "ldr r8, [sp, 
#60]\n\t" - "ldr r9, [sp, #16]\n\t" - "ldr r10, [sp, #20]\n\t" - "ldr r11, [sp, #24]\n\t" - "ldr r14, [sp, #28]\n\t" - "sbcs r3, r3, r9\n\t" - "sbcs r4, r4, r10\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r14\n\t" - "sbc r8, r8, #0\n\t" - "str r3, [sp, #44]\n\t" - "str r4, [sp, #48]\n\t" - "str r5, [sp, #52]\n\t" - "str r6, [sp, #56]\n\t" - "str r8, [sp, #60]\n\t" + /* a[0] = = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "ADDS r0, r0, r5\n\t" + "ADCS r1, r1, r6\n\t" + "ADCS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC lr, lr, #0x0\n\t" + "STR r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "ADD r0, sp, #0x20\n\t" + "LDM r0, {r2, r3, r4}\n\t" + "ADDS r2, r2, lr\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADC lr, lr, #0x0\n\t" + "STM r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "LDM r0, {r0, r1, r2, r3, r4}\n\t" + "ADDS r0, r0, lr\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r0, r0, r8\n\t" + "ADCS r1, r1, r9\n\t" + "ADCS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r0, r0, r10\n\t" + "ADCS r1, r1, r11\n\t" + "ADCS r2, r2, r12\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "STR r0, [sp, #44]\n\t" + "STR r1, [sp, #48]\n\t" + "STR r2, [sp, #52]\n\t" + "STR r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "ADD r0, sp, #0x1c\n\t" + "LDM r0, {r0, r1, r2, r3}\n\t" + "SUBS r0, r0, r5\n\t" + "SBCS r1, r1, r6\n\t" + "SBCS r2, r2, r7\n\t" + "SBCS r3, r3, r8\n\t" + "ADD r0, sp, #0x2c\n\t" + "MOV r8, r4\n\t" + "LDM r0, {r4, r5, r6, r7}\n\t" + "SBCS r4, r4, r9\n\t" + "SBCS r5, r5, r10\n\t" + "SBCS r6, r6, r11\n\t" + "SBCS r7, r7, r12\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBC lr, lr, #0x0\n\t" /* mask m and sub from result if overflow */ - "sub %[b], %[a], %[b]\n\t" - "and %[a], %[b], #1\n\t" - "ldr r3, [sp, #32]\n\t" - "ldr r4, [sp, #36]\n\t" - "ldr r5, [sp, #40]\n\t" - "ldr r6, [sp, #44]\n\t" - "ldr r8, [sp, #48]\n\t" - "ldr r9, [sp, #52]\n\t" - "ldr r10, [sp, #56]\n\t" - "ldr r11, [sp, #60]\n\t" - "subs r3, r3, %[b]\n\t" - "sbcs r4, r4, %[b]\n\t" - "sbcs r5, r5, %[b]\n\t" - "sbcs r6, r6, #0\n\t" - "sbcs r8, r8, #0\n\t" - "sbcs r9, r9, #0\n\t" - "sbcs r10, r10, %[a]\n\t" - "sbc r11, r11, %[b]\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "str r8, [%[r], #16]\n\t" - "str r9, [%[r], #20]\n\t" - "str r10, [%[r], #24]\n\t" - "str r11, [%[r], #28]\n\t" - "add sp, sp, #68\n\t" - : [a] "+r" (a), [b] "+r" (b) - : [r] "r" (r) - : "memory", "r9", "r10", "r11", "r14", "r3", "r4", "r5", "r6", "r8" + "RSB lr, lr, #0x0\n\t" + "SUBS r1, r1, 
lr\n\t" + "SBCS r2, r2, lr\n\t" + "SBCS r3, r3, lr\n\t" + "SBCS r4, r4, #0x0\n\t" + "SBCS r5, r5, #0x0\n\t" + "SBCS r6, r6, #0x0\n\t" + "SBCS r7, r7, lr, LSR #31\n\t" + "SBC r8, r8, lr\n\t" + "LDR %[r], [sp, #64]\n\t" + "STM %[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "ADD sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); + (void)m_p; + (void)mp_p; } +#else +/* Multiply two Montgomery form numbers mod the modulus (prime). + * (r = a * b mod m) + * + * r Result of multiplication. + * a First number to multiply in Montgomery form. + * b Second number to multiply in Montgomery form. + * m Modulus (prime). + * mp Montgomery multiplier. + */ +static void sp_256_mont_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x4c\n\t" + "STRD %[r], %[a], [sp, #68]\n\t" + "MOV lr, %[b]\n\t" + "LDM %[a], {%[r], %[a], %[b], r3}\n\t" + "LDM lr!, {r4, r5, r6}\n\t" + "UMULL r10, r11, %[r], r4\n\t" + "UMULL r12, r7, %[a], r4\n\t" + "UMAAL r11, r12, %[r], r5\n\t" + "UMULL r8, r9, %[b], r4\n\t" + "UMAAL r12, r8, %[a], r5\n\t" + "UMAAL r12, r7, %[r], r6\n\t" + "UMAAL r8, r9, r3, r4\n\t" + "STM sp, {r10, r11, r12}\n\t" + "UMAAL r7, r8, %[b], r5\n\t" + "LDM lr!, {r4}\n\t" + "UMULL r10, r11, %[a], r6\n\t" + "UMAAL r8, r9, %[b], r6\n\t" + "UMAAL r7, r10, %[r], r4\n\t" + "UMAAL r8, r11, r3, r5\n\t" + "STR r7, [sp, #12]\n\t" + "UMAAL r8, r10, %[a], r4\n\t" + "UMAAL r9, r11, r3, r6\n\t" + "UMAAL r9, r10, %[b], r4\n\t" + "UMAAL r10, r11, r3, r4\n\t" + "LDM lr, {r4, r5, r6, r7}\n\t" + "MOV r12, #0x0\n\t" + "UMLAL r8, r12, %[r], r4\n\t" + "UMAAL r9, r12, %[a], r4\n\t" + "UMAAL r10, r12, %[b], r4\n\t" + "UMAAL r11, r12, r3, r4\n\t" + "MOV r4, #0x0\n\t" + "UMLAL r9, r4, %[r], r5\n\t" + "UMAAL r10, r4, %[a], r5\n\t" + "UMAAL r11, r4, %[b], r5\n\t" + "UMAAL r12, r4, r3, r5\n\t" + "MOV r5, #0x0\n\t" + "UMLAL r10, r5, %[r], r6\n\t" + "UMAAL r11, r5, %[a], r6\n\t" + "UMAAL r12, r5, %[b], r6\n\t" + "UMAAL r4, r5, r3, r6\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r11, r6, %[r], r7\n\t" + "LDR %[r], [sp, #72]\n\t" + "UMAAL r12, r6, %[a], r7\n\t" + "ADD %[r], %[r], #0x10\n\t" + "UMAAL r4, r6, %[b], r7\n\t" + "SUB lr, lr, #0x10\n\t" + "UMAAL r5, r6, r3, r7\n\t" + "LDM %[r], {%[r], %[a], %[b], r3}\n\t" + "STR r6, [sp, #64]\n\t" + "LDM lr!, {r6}\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r8, r7, %[r], r6\n\t" + "UMAAL r9, r7, %[a], r6\n\t" + "STR r8, [sp, #16]\n\t" + "UMAAL r10, r7, %[b], r6\n\t" + "UMAAL r11, r7, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r8, #0x0\n\t" + "UMLAL r9, r8, %[r], r6\n\t" + "UMAAL r10, r8, %[a], r6\n\t" + "STR r9, [sp, #20]\n\t" + "UMAAL r11, r8, %[b], r6\n\t" + "UMAAL r12, r8, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r9, #0x0\n\t" + "UMLAL r10, r9, %[r], r6\n\t" + "UMAAL r11, r9, %[a], r6\n\t" + "STR r10, [sp, #24]\n\t" + "UMAAL r12, r9, %[b], r6\n\t" + "UMAAL r4, r9, r3, r6\n\t" + "LDM lr!, {r6}\n\t" + "MOV r10, #0x0\n\t" + "UMLAL r11, r10, %[r], r6\n\t" + "UMAAL r12, r10, %[a], r6\n\t" + "STR r11, [sp, #28]\n\t" + "UMAAL r4, r10, %[b], r6\n\t" + "UMAAL r5, r10, r3, r6\n\t" + "LDM lr!, {r11}\n\t" + "UMAAL r12, r7, %[r], r11\n\t" + "UMAAL r4, r7, %[a], r11\n\t" + "LDR r6, [sp, #64]\n\t" + "UMAAL r5, r7, %[b], r11\n\t" + "UMAAL r6, r7, r3, r11\n\t" + 
"LDM lr!, {r11}\n\t" + "UMAAL r4, r8, %[r], r11\n\t" + "UMAAL r5, r8, %[a], r11\n\t" + "UMAAL r6, r8, %[b], r11\n\t" + "UMAAL r7, r8, r3, r11\n\t" + "LDM lr, {r11, lr}\n\t" + "UMAAL r5, r9, %[r], r11\n\t" + "UMAAL r6, r10, %[r], lr\n\t" + "UMAAL r6, r9, %[a], r11\n\t" + "UMAAL r7, r10, %[a], lr\n\t" + "UMAAL r7, r9, %[b], r11\n\t" + "UMAAL r8, r10, %[b], lr\n\t" + "UMAAL r8, r9, r3, r11\n\t" + "UMAAL r9, r10, r3, lr\n\t" + "MOV r3, r12\n\t" + "ADD lr, sp, #0x20\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* Start Reduction */ + "LDM sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "MOV r3, r11\n\t" + "MOV r4, r12\n\t" + /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ + /* - a[0] << 224 */ + /* + (a[0]-a[1] * 2) << (6 * 32) */ + "ADDS r11, r11, r5\n\t" + "ADC r12, r12, r6\n\t" + "ADDS r11, r11, r5\n\t" + "ADC r12, r12, r6\n\t" + /* - a[0] << (7 * 32) */ + "SUB r12, r12, r5\n\t" + /* + a[0]-a[4] << (3 * 32) */ + "MOV r0, r8\n\t" + "MOV r1, r9\n\t" + "MOV r2, r10\n\t" + "ADDS r8, r8, r5\n\t" + "ADCS r9, r9, r6\n\t" + "ADCS r10, r10, r7\n\t" + "ADCS r11, r11, r0\n\t" + "ADC r12, r12, r1\n\t" + /* a += mu * m */ + /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ + /* a[0] = = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "ADDS r0, r0, r5\n\t" + "ADCS r1, r1, r6\n\t" + "ADCS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC lr, lr, #0x0\n\t" + "STR r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "ADD r0, sp, #0x20\n\t" + "LDM r0, {r2, r3, r4}\n\t" + "ADDS r2, r2, lr\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADC lr, lr, #0x0\n\t" + "STM r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "LDM r0, {r0, r1, r2, r3, r4}\n\t" + "ADDS r0, r0, lr\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r0, r0, r8\n\t" + "ADCS r1, r1, r9\n\t" + "ADCS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r0, r0, r10\n\t" + "ADCS r1, r1, r11\n\t" + "ADCS r2, r2, r12\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "STR r0, [sp, #44]\n\t" + "STR r1, [sp, #48]\n\t" + "STR r2, [sp, #52]\n\t" + "STR r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "ADD r0, sp, #0x1c\n\t" + "LDM r0, {r0, r1, r2, r3}\n\t" + "SUBS r0, r0, r5\n\t" + "SBCS r1, r1, r6\n\t" + "SBCS r2, r2, r7\n\t" + "SBCS r3, r3, r8\n\t" + "ADD r0, sp, #0x2c\n\t" + "MOV r8, r4\n\t" + "LDM r0, {r4, r5, r6, r7}\n\t" + "SBCS r4, r4, r9\n\t" + "SBCS r5, r5, r10\n\t" + "SBCS r6, r6, r11\n\t" + "SBCS r7, r7, r12\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBC lr, lr, #0x0\n\t" + /* mask m and sub from result if overflow */ + "RSB lr, lr, 
#0x0\n\t" + "SUBS r1, r1, lr\n\t" + "SBCS r2, r2, lr\n\t" + "SBCS r3, r3, lr\n\t" + "SBCS r4, r4, #0x0\n\t" + "SBCS r5, r5, #0x0\n\t" + "SBCS r6, r6, #0x0\n\t" + "SBCS r7, r7, lr, LSR #31\n\t" + "SBC r8, r8, lr\n\t" + "LDR %[r], [sp, #68]\n\t" + "STM %[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "ADD sp, sp, #0x4c\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r10", "r11", "r12", "r7", "r8", "r9", "lr" + ); + (void)m_p; + (void)mp_p; +} + +#endif +#ifdef WOLFSSL_SP_NO_UMAAL /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m) * * r Result of squaring. @@ -17862,515 +31460,600 @@ SP_NOINLINE static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const * m Modulus (prime). * mp Montgomery multiplier. */ -SP_NOINLINE static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - (void)mp; - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "sub sp, sp, #68\n\t" - "mov r5, #0\n\t" - /* A[0] * A[1] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #4]\n\t" - "umull r10, r11, r6, r8\n\t" - "str r10, [sp, #4]\n\t" - /* A[0] * A[2] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adc r14, r4, #0\n\t" - "str r11, [sp, #8]\n\t" - /* A[0] * A[3] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adc r9, r4, #0\n\t" - /* A[1] * A[2] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #8]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" - "str r14, [sp, #12]\n\t" - /* A[0] * A[4] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adc r10, r4, r10\n\t" - /* A[1] * A[3] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, #0\n\t" - "str r9, [sp, #16]\n\t" - /* A[0] * A[5] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adc r11, r4, r11\n\t" - /* A[1] * A[4] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, #0\n\t" - /* A[2] * A[3] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #12]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - "str r10, [sp, #20]\n\t" - /* A[0] * A[6] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, #0\n\t" - /* A[1] * A[5] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - /* A[2] * A[4] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - "str r11, [sp, #24]\n\t" - /* A[0] * A[7] */ - "ldr r6, [%[a], #0]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" - /* A[1] * A[6] */ - "ldr r6, 
[%[a], #4]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[2] * A[5] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - /* A[3] * A[4] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #16]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - "str r14, [sp, #28]\n\t" - /* A[1] * A[7] */ - "ldr r6, [%[a], #4]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, #0\n\t" - /* A[2] * A[6] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - /* A[3] * A[5] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, r11\n\t" - "str r9, [sp, #32]\n\t" - /* A[2] * A[7] */ - "ldr r6, [%[a], #8]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, #0\n\t" - /* A[3] * A[6] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - /* A[4] * A[5] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #20]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adcs r11, r4, r11\n\t" - "adc r14, r5, r14\n\t" - "str r10, [sp, #36]\n\t" - /* A[3] * A[7] */ - "ldr r6, [%[a], #12]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, #0\n\t" - /* A[4] * A[6] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r11, r3, r11\n\t" - "adcs r14, r4, r14\n\t" - "adc r9, r5, r9\n\t" - "str r11, [sp, #40]\n\t" - /* A[4] * A[7] */ - "ldr r6, [%[a], #16]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, #0\n\t" - /* A[5] * A[6] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[a], #24]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r14, r3, r14\n\t" - "adcs r9, r4, r9\n\t" - "adc r10, r5, r10\n\t" - "str r14, [sp, #44]\n\t" - /* A[5] * A[7] */ - "ldr r6, [%[a], #20]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r9, r3, r9\n\t" - "adcs r10, r4, r10\n\t" - "adc r11, r5, #0\n\t" - "str r9, [sp, #48]\n\t" - /* A[6] * A[7] */ - "ldr r6, [%[a], #24]\n\t" - "ldr r8, [%[a], #28]\n\t" - "umull r3, r4, r6, r8\n\t" - "adds r10, r3, r10\n\t" - "adc r11, r4, r11\n\t" - "str r10, [sp, #52]\n\t" - "str r11, [sp, #56]\n\t" - /* Double */ - "ldr r4, [sp, #4]\n\t" - "ldr r6, [sp, #8]\n\t" - "ldr r8, [sp, #12]\n\t" - "ldr r9, [sp, #16]\n\t" - "ldr r10, [sp, #20]\n\t" - "ldr r11, [sp, #24]\n\t" - "ldr r14, [sp, #28]\n\t" - "ldr r12, [sp, #32]\n\t" - "ldr r3, [sp, #36]\n\t" - "adds r4, r4, r4\n\t" - "adcs r6, r6, r6\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adcs r11, r11, r11\n\t" - "adcs r14, r14, r14\n\t" - "adcs r12, r12, r12\n\t" - "adcs r3, r3, r3\n\t" - "str r4, [sp, #4]\n\t" - "str r6, [sp, #8]\n\t" - "str r8, [sp, #12]\n\t" - "str r9, [sp, #16]\n\t" - "str r10, [sp, #20]\n\t" - "str r11, [sp, #24]\n\t" - "str r14, [sp, #28]\n\t" - "str r12, [sp, #32]\n\t" - "str 
r3, [sp, #36]\n\t" - "ldr r4, [sp, #40]\n\t" - "ldr r6, [sp, #44]\n\t" - "ldr r8, [sp, #48]\n\t" - "ldr r9, [sp, #52]\n\t" - "ldr r10, [sp, #56]\n\t" - "adcs r4, r4, r4\n\t" - "adcs r6, r6, r6\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "str r4, [sp, #40]\n\t" - "str r6, [sp, #44]\n\t" - "str r8, [sp, #48]\n\t" - "str r9, [sp, #52]\n\t" - "str r10, [sp, #56]\n\t" - "adc r11, r5, #0\n\t" - "str r11, [sp, #60]\n\t" - "ldr r4, [sp, #4]\n\t" - "ldr r5, [sp, #8]\n\t" - "ldr r12, [sp, #12]\n\t" - /* A[0] * A[0] */ - "ldr r6, [%[a], #0]\n\t" - "umull r9, r10, r6, r6\n\t" - /* A[1] * A[1] */ - "ldr r6, [%[a], #4]\n\t" - "umull r11, r14, r6, r6\n\t" - "adds r10, r10, r4\n\t" - "adcs r11, r11, r5\n\t" - "adcs r14, r14, r12\n\t" - "str r9, [sp, #0]\n\t" - "str r10, [sp, #4]\n\t" - "str r11, [sp, #8]\n\t" - "str r14, [sp, #12]\n\t" - "ldr r3, [sp, #16]\n\t" - "ldr r4, [sp, #20]\n\t" - "ldr r5, [sp, #24]\n\t" - "ldr r12, [sp, #28]\n\t" - /* A[2] * A[2] */ - "ldr r6, [%[a], #8]\n\t" - "umull r9, r10, r6, r6\n\t" - /* A[3] * A[3] */ - "ldr r6, [%[a], #12]\n\t" - "umull r11, r14, r6, r6\n\t" - "adcs r9, r9, r3\n\t" - "adcs r10, r10, r4\n\t" - "adcs r11, r11, r5\n\t" - "adcs r14, r14, r12\n\t" - "str r9, [sp, #16]\n\t" - "str r10, [sp, #20]\n\t" - "str r11, [sp, #24]\n\t" - "str r14, [sp, #28]\n\t" - "ldr r3, [sp, #32]\n\t" - "ldr r4, [sp, #36]\n\t" - "ldr r5, [sp, #40]\n\t" - "ldr r12, [sp, #44]\n\t" - /* A[4] * A[4] */ - "ldr r6, [%[a], #16]\n\t" - "umull r9, r10, r6, r6\n\t" - /* A[5] * A[5] */ - "ldr r6, [%[a], #20]\n\t" - "umull r11, r14, r6, r6\n\t" - "adcs r9, r9, r3\n\t" - "adcs r10, r10, r4\n\t" - "adcs r11, r11, r5\n\t" - "adcs r14, r14, r12\n\t" - "str r9, [sp, #32]\n\t" - "str r10, [sp, #36]\n\t" - "str r11, [sp, #40]\n\t" - "str r14, [sp, #44]\n\t" - "ldr r3, [sp, #48]\n\t" - "ldr r4, [sp, #52]\n\t" - "ldr r5, [sp, #56]\n\t" - "ldr r12, [sp, #60]\n\t" - /* A[6] * A[6] */ - "ldr r6, [%[a], #24]\n\t" - "umull r9, r10, r6, r6\n\t" - /* A[7] * A[7] */ - "ldr r6, [%[a], #28]\n\t" - "umull r11, r14, r6, r6\n\t" - "adcs r9, r9, r3\n\t" - "adcs r10, r10, r4\n\t" - "adcs r11, r11, r5\n\t" - "adc r14, r14, r12\n\t" - "str r9, [sp, #48]\n\t" - "str r10, [sp, #52]\n\t" - "str r11, [sp, #56]\n\t" - "str r14, [sp, #60]\n\t" + "SUB sp, sp, #0x44\n\t" + "STR %[r], [sp, #64]\n\t" + "MOV %[r], #0x0\n\t" + "LDR r12, [%[a]]\n\t" + /* A[0] * A[1] */ + "LDR lr, [%[a], #4]\n\t" + "UMULL r4, r5, r12, lr\n\t" + /* A[0] * A[3] */ + "LDR lr, [%[a], #12]\n\t" + "UMULL r6, r7, r12, lr\n\t" + /* A[0] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "UMULL r8, r9, r12, lr\n\t" + /* A[0] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "UMULL r10, r3, r12, lr\n\t" + /* A[0] * A[2] */ + "LDR lr, [%[a], #8]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[0] * A[4] */ + "LDR lr, [%[a], #16]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "ADDS r8, r8, r11\n\t" + /* A[0] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + "ADCS r3, r3, #0x0\n\t" + "STR r4, [sp, #4]\n\t" + "STR r5, [sp, #8]\n\t" + /* A[1] * A[2] */ + "LDR r12, [%[a], #4]\n\t" + "LDR lr, [%[a], #8]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "STR r6, [sp, #12]\n\t" + "ADDS r7, r7, r11\n\t" + /* A[1] * A[3] */ + "LDR lr, [%[a], #12]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, lr\n\t" + "STR r7, [sp, #16]\n\t" + "ADDS r8, r8, r11\n\t" + 
/* A[1] * A[4] */ + "LDR lr, [%[a], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "ADDS r9, r9, r11\n\t" + /* A[1] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "ADDS r10, r10, r11\n\t" + /* A[1] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[1] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "ADC r4, %[r], #0x0\n\t" + "UMLAL r3, r4, r12, lr\n\t" + /* A[2] * A[3] */ + "LDR r12, [%[a], #8]\n\t" + "LDR lr, [%[a], #12]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r8, r11, r12, lr\n\t" + "STR r8, [sp, #20]\n\t" + "ADDS r9, r9, r11\n\t" + /* A[2] * A[4] */ + "LDR lr, [%[a], #16]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, lr\n\t" + "STR r9, [sp, #24]\n\t" + "ADDS r10, r10, r11\n\t" + /* A[2] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "ADDS r3, r3, r11\n\t" + /* A[2] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[2] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "ADC r5, %[r], #0x0\n\t" + "UMLAL r4, r5, r12, lr\n\t" + /* A[3] * A[4] */ + "LDR r12, [%[a], #12]\n\t" + "LDR lr, [%[a], #16]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r10, r11, r12, lr\n\t" + "STR r10, [sp, #28]\n\t" + "ADDS r3, r3, r11\n\t" + /* A[3] * A[5] */ + "LDR lr, [%[a], #20]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, lr\n\t" + "ADDS r4, r4, r11\n\t" + /* A[3] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[3] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "ADC r6, %[r], #0x0\n\t" + "UMLAL r5, r6, r12, lr\n\t" + /* A[4] * A[5] */ + "LDR r12, [%[a], #16]\n\t" + "LDR lr, [%[a], #20]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r4, r11, r12, lr\n\t" + "ADDS r5, r5, r11\n\t" + /* A[4] * A[6] */ + "LDR lr, [%[a], #24]\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, lr\n\t" + "ADDS r6, r6, r11\n\t" + /* A[4] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "ADC r7, %[r], #0x0\n\t" + "UMLAL r6, r7, r12, lr\n\t" + /* A[5] * A[6] */ + "LDR r12, [%[a], #20]\n\t" + "LDR lr, [%[a], #24]\n\t" + "MOV r11, #0x0\n\t" + "UMLAL r6, r11, r12, lr\n\t" + "ADDS r7, r7, r11\n\t" + /* A[5] * A[7] */ + "LDR lr, [%[a], #28]\n\t" + "ADC r8, %[r], #0x0\n\t" + "UMLAL r7, r8, r12, lr\n\t" + /* A[6] * A[7] */ + "LDR r12, [%[a], #24]\n\t" + "LDR lr, [%[a], #28]\n\t" + "MOV r9, #0x0\n\t" + "UMLAL r8, r9, r12, lr\n\t" + "ADD lr, sp, #0x20\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADD lr, sp, #0x4\n\t" + "LDM lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "STM lr!, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM lr, {r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADCS r3, r3, r3\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADC r10, %[r], #0x0\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "ADD lr, sp, #0x4\n\t" + "LDM lr, {r4, r5, r6, r7, r8, r9, r10}\n\t" + "MOV lr, sp\n\t" + /* A[0] * A[0] */ + "LDR r12, [%[a]]\n\t" + "UMULL r3, r11, r12, r12\n\t" + "ADDS r4, r4, r11\n\t" + /* A[1] * A[1] */ + "LDR r12, [%[a], #4]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, r12\n\t" + "ADDS r6, r6, r11\n\t" + /* A[2] 
* A[2] */ + "LDR r12, [%[a], #8]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, r12\n\t" + "ADDS r8, r8, r11\n\t" + /* A[3] * A[3] */ + "LDR r12, [%[a], #12]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r9, r11, r12, r12\n\t" + "ADDS r10, r10, r11\n\t" + "STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + "LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" + /* A[4] * A[4] */ + "LDR r12, [%[a], #16]\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r3, r11, r12, r12\n\t" + "ADDS r4, r4, r11\n\t" + /* A[5] * A[5] */ + "LDR r12, [%[a], #20]\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r5, r11, r12, r12\n\t" + "ADDS r6, r6, r11\n\t" + /* A[6] * A[6] */ + "LDR r12, [%[a], #24]\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADC r11, %[r], #0x0\n\t" + "UMLAL r7, r11, r12, r12\n\t" + "ADDS r8, r8, r11\n\t" + /* A[7] * A[7] */ + "LDR r12, [%[a], #28]\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADC r10, r10, #0x0\n\t" + "UMLAL r9, r10, r12, r12\n\t" + "ADD lr, sp, #0x20\n\t" + "STM lr, {r3, r4, r5, r6, r7, r8, r9, r10}\n\t" /* Start Reduction */ - "ldr r4, [sp, #0]\n\t" - "ldr r5, [sp, #4]\n\t" - "ldr r6, [sp, #8]\n\t" - "ldr r8, [sp, #12]\n\t" - "ldr r9, [sp, #16]\n\t" - "ldr r10, [sp, #20]\n\t" - "ldr r11, [sp, #24]\n\t" - "ldr r14, [sp, #28]\n\t" + "LDM sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "MOV r3, r11\n\t" + "MOV r4, r12\n\t" /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ /* - a[0] << 224 */ /* + (a[0]-a[1] * 2) << (6 * 32) */ - "adds r11, r11, r4\n\t" - "adc r14, r14, r5\n\t" - "adds r11, r11, r4\n\t" - "adc r14, r14, r5\n\t" + "ADDS r11, r11, r5\n\t" + "ADC r12, r12, r6\n\t" + "ADDS r11, r11, r5\n\t" + "ADC r12, r12, r6\n\t" /* - a[0] << (7 * 32) */ - "sub r14, r14, r4\n\t" + "SUB r12, r12, r5\n\t" /* + a[0]-a[4] << (3 * 32) */ - "mov %[a], r8\n\t" - "mov r12, r9\n\t" - "adds r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r11, r11, %[a]\n\t" - "adc r14, r14, r12\n\t" - "str r4, [sp, #0]\n\t" - "str r5, [sp, #4]\n\t" - "str r6, [sp, #8]\n\t" - "str r8, [sp, #12]\n\t" - "str r9, [sp, #16]\n\t" - "str r10, [sp, #20]\n\t" + "MOV r0, r8\n\t" + "MOV r1, r9\n\t" + "MOV r2, r10\n\t" + "ADDS r8, r8, r5\n\t" + "ADCS r9, r9, r6\n\t" + "ADCS r10, r10, r7\n\t" + "ADCS r11, r11, r0\n\t" + "ADC r12, r12, r1\n\t" /* a += mu * m */ /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ - "mov %[a], #0\n\t" - /* a[6] += t[0] + t[3] */ - "ldr r3, [sp, #24]\n\t" - "adds r3, r3, r4\n\t" - "adc r12, %[a], #0\n\t" - "adds r3, r3, r8\n\t" - "adc r12, r12, #0\n\t" - "str r11, [sp, #24]\n\t" - /* a[7] += t[1] + t[4] */ - "ldr r3, [sp, #28]\n\t" - "adds r3, r3, r12\n\t" - "adc r12, %[a], #0\n\t" - "adds r3, r3, r5\n\t" - "adc r12, r12, #0\n\t" - "adds r3, r3, r9\n\t" - "adc r12, r12, #0\n\t" - "str r14, [sp, #28]\n\t" - "str r3, [sp, #64]\n\t" - /* a[8] += t[0] + t[2] + t[5] */ - "ldr r3, [sp, #32]\n\t" - "adds r3, r3, r12\n\t" - "adc r12, %[a], #0\n\t" - "adds r3, r3, r4\n\t" - "adc r12, r12, #0\n\t" - "adds r3, r3, r6\n\t" - "adc r12, r12, #0\n\t" - "adds r3, r3, r10\n\t" - "adc r12, r12, #0\n\t" - "str r3, [sp, #32]\n\t" - /* a[9] += t[1] + t[3] + t[6] */ - /* a[10] += t[2] + t[4] + t[7] */ - "ldr r3, [sp, #36]\n\t" - "ldr r4, [sp, #40]\n\t" - "adds r3, r3, r12\n\t" - "adcs r4, r4, #0\n\t" - "adc r12, %[a], #0\n\t" - "adds r3, r3, r5\n\t" - "adcs r4, r4, r6\n\t" - "adc r12, r12, #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adc r12, r12, #0\n\t" - "adds r3, r3, 
r11\n\t" - "adcs r4, r4, r14\n\t" - "adc r12, r12, #0\n\t" - "str r3, [sp, #36]\n\t" - "str r4, [sp, #40]\n\t" - /* a[11] += t[3] + t[5] */ - /* a[12] += t[4] + t[6] */ - /* a[13] += t[5] + t[7] */ - /* a[14] += t[6] */ - "ldr r3, [sp, #44]\n\t" - "ldr r4, [sp, #48]\n\t" - "ldr r5, [sp, #52]\n\t" - "ldr r6, [sp, #56]\n\t" - "adds r3, r3, r12\n\t" - "adcs r4, r4, #0\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, #0\n\t" - "adc r12, %[a], #0\n\t" - "adds r3, r3, r8\n\t" - "adcs r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adcs r6, r6, r11\n\t" - "adc r12, r12, #0\n\t" - "adds r3, r3, r10\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r14\n\t" - "adcs r6, r6, #0\n\t" - "adc r12, r12, #0\n\t" - "str r3, [sp, #44]\n\t" - "str r4, [sp, #48]\n\t" - "str r5, [sp, #52]\n\t" - "str r6, [sp, #56]\n\t" - /* a[15] += t[7] */ - "ldr r3, [sp, #60]\n\t" - "adds r3, r3, r12\n\t" - "adc r12, %[a], #0\n\t" - "adds r3, r3, r14\n\t" - "adc r12, r12, #0\n\t" - "str r3, [sp, #60]\n\t" - "ldr r3, [sp, #64]\n\t" - "ldr r4, [sp, #32]\n\t" - "ldr r5, [sp, #36]\n\t" - "ldr r6, [sp, #40]\n\t" - "ldr r9, [sp, #0]\n\t" - "ldr r10, [sp, #4]\n\t" - "ldr r11, [sp, #8]\n\t" - "ldr r14, [sp, #12]\n\t" - "subs r3, r3, r9\n\t" - "sbcs r4, r4, r10\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r14\n\t" - "str r4, [sp, #32]\n\t" - "str r5, [sp, #36]\n\t" - "str r6, [sp, #40]\n\t" - "ldr r3, [sp, #44]\n\t" - "ldr r4, [sp, #48]\n\t" - "ldr r5, [sp, #52]\n\t" - "ldr r6, [sp, #56]\n\t" - "ldr r8, [sp, #60]\n\t" - "ldr r9, [sp, #16]\n\t" - "ldr r10, [sp, #20]\n\t" - "ldr r11, [sp, #24]\n\t" - "ldr r14, [sp, #28]\n\t" - "sbcs r3, r3, r9\n\t" - "sbcs r4, r4, r10\n\t" - "sbcs r5, r5, r11\n\t" - "sbcs r6, r6, r14\n\t" - "sbc r8, r8, #0\n\t" - "str r3, [sp, #44]\n\t" - "str r4, [sp, #48]\n\t" - "str r5, [sp, #52]\n\t" - "str r6, [sp, #56]\n\t" - "str r8, [sp, #60]\n\t" + /* a[0] = = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "ADDS r0, r0, r5\n\t" + "ADCS r1, r1, r6\n\t" + "ADCS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC lr, lr, #0x0\n\t" + "STR r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "ADD r0, sp, #0x20\n\t" + "LDM r0, {r2, r3, r4}\n\t" + "ADDS r2, r2, lr\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADC lr, lr, #0x0\n\t" + "STM r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "LDM r0, {r0, r1, r2, r3, r4}\n\t" + "ADDS r0, r0, lr\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r0, r0, r8\n\t" + "ADCS r1, r1, r9\n\t" + "ADCS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r0, r0, r10\n\t" + "ADCS r1, r1, r11\n\t" + "ADCS r2, r2, r12\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS 
r4, r4, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "STR r0, [sp, #44]\n\t" + "STR r1, [sp, #48]\n\t" + "STR r2, [sp, #52]\n\t" + "STR r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "ADD r0, sp, #0x1c\n\t" + "LDM r0, {r0, r1, r2, r3}\n\t" + "SUBS r0, r0, r5\n\t" + "SBCS r1, r1, r6\n\t" + "SBCS r2, r2, r7\n\t" + "SBCS r3, r3, r8\n\t" + "ADD r0, sp, #0x2c\n\t" + "MOV r8, r4\n\t" + "LDM r0, {r4, r5, r6, r7}\n\t" + "SBCS r4, r4, r9\n\t" + "SBCS r5, r5, r10\n\t" + "SBCS r6, r6, r11\n\t" + "SBCS r7, r7, r12\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBC lr, lr, #0x0\n\t" /* mask m and sub from result if overflow */ - "sub r12, %[a], r12\n\t" - "and %[a], r12, #1\n\t" - "ldr r3, [sp, #32]\n\t" - "ldr r4, [sp, #36]\n\t" - "ldr r5, [sp, #40]\n\t" - "ldr r6, [sp, #44]\n\t" - "ldr r8, [sp, #48]\n\t" - "ldr r9, [sp, #52]\n\t" - "ldr r10, [sp, #56]\n\t" - "ldr r11, [sp, #60]\n\t" - "subs r3, r3, r12\n\t" - "sbcs r4, r4, r12\n\t" - "sbcs r5, r5, r12\n\t" - "sbcs r6, r6, #0\n\t" - "sbcs r8, r8, #0\n\t" - "sbcs r9, r9, #0\n\t" - "sbcs r10, r10, %[a]\n\t" - "sbc r11, r11, r12\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "str r8, [%[r], #16]\n\t" - "str r9, [%[r], #20]\n\t" - "str r10, [%[r], #24]\n\t" - "str r11, [%[r], #28]\n\t" - "add sp, sp, #68\n\t" - : [a] "+r" (a) - : [r] "r" (r) - : "memory", "r9", "r10", "r11", "r14", "r3", "r4", "r5", "r6", "r8", "r12" + "RSB lr, lr, #0x0\n\t" + "SUBS r1, r1, lr\n\t" + "SBCS r2, r2, lr\n\t" + "SBCS r3, r3, lr\n\t" + "SBCS r4, r4, #0x0\n\t" + "SBCS r5, r5, #0x0\n\t" + "SBCS r6, r6, #0x0\n\t" + "SBCS r7, r7, lr, LSR #31\n\t" + "SBC r8, r8, lr\n\t" + "LDR %[r], [sp, #64]\n\t" + "STM %[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "ADD sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); + (void)m_p; + (void)mp_p; } +#else +/* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m) + * + * r Result of squaring. + * a Number to square in Montgomery form. + * m Modulus (prime). + * mp Montgomery multiplier. 
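An illustrative aside, not part of the patch: the "Start Reduction" comments in these squaring routines rely on the sparse form of the NIST P-256 prime, which is why the mu * m accumulation can be done with a handful of shifted adds and one subtract instead of eight word multiplies. In standard Montgomery terms, with R = 2^256:

\[ m = 2^{256} - 2^{224} + 2^{192} + 2^{96} - 1, \qquad \mu = a \cdot (-m^{-1}) \bmod R, \qquad r = \frac{a + \mu m}{R} \equiv a R^{-1} \pmod{m}, \quad r < 2m. \]

The masked subtraction at the end of each routine removes the possible extra multiple of m without branching.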
+ */ +static void sp_256_mont_sqr_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x44\n\t" + "STR %[r], [sp, #64]\n\t" + "LDM %[a], {%[r], %[a], r2, r3, r4, r5, r6, r7}\n\t" + "UMULL r9, r10, %[r], %[r]\n\t" + "UMULL r11, r12, %[r], %[a]\n\t" + "ADDS r11, r11, r11\n\t" + "MOV lr, #0x0\n\t" + "UMAAL r10, r11, lr, lr\n\t" + "STM sp, {r9, r10}\n\t" + "MOV r8, lr\n\t" + "UMAAL r8, r12, %[r], r2\n\t" + "ADCS r8, r8, r8\n\t" + "UMAAL r8, r11, %[a], %[a]\n\t" + "UMULL r9, r10, %[r], r3\n\t" + "UMAAL r9, r12, %[a], r2\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, lr, lr\n\t" + "STRD r8, r9, [sp, #8]\n\t" + "MOV r9, lr\n\t" + "UMAAL r9, r10, %[r], r4\n\t" + "UMAAL r9, r12, %[a], r3\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, r2, r2\n\t" + "STR r9, [sp, #16]\n\t" + "UMULL r9, r8, %[r], r5\n\t" + "UMAAL r9, r12, %[a], r4\n\t" + "UMAAL r9, r10, r2, r3\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, lr, lr\n\t" + "STR r9, [sp, #20]\n\t" + "MOV r9, lr\n\t" + "UMAAL r9, r8, %[r], r6\n\t" + "UMAAL r9, r12, %[a], r5\n\t" + "UMAAL r9, r10, r2, r4\n\t" + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r11, r3, r3\n\t" + "STR r9, [sp, #24]\n\t" + "UMULL %[r], r9, %[r], r7\n\t" + "UMAAL %[r], r8, %[a], r6\n\t" + "UMAAL %[r], r12, r2, r5\n\t" + "UMAAL %[r], r10, r3, r4\n\t" + "ADCS %[r], %[r], %[r]\n\t" + "UMAAL %[r], r11, lr, lr\n\t" + /* R[7] = r0 */ + "UMAAL r9, r8, %[a], r7\n\t" + "UMAAL r9, r10, r2, r6\n\t" + "UMAAL r12, r9, r3, r5\n\t" + "ADCS r12, r12, r12\n\t" + "UMAAL r12, r11, r4, r4\n\t" + /* R[8] = r12 */ + "UMAAL r9, r8, r2, r7\n\t" + "UMAAL r10, r9, r3, r6\n\t" + "MOV r2, lr\n\t" + "UMAAL r10, r2, r4, r5\n\t" + "ADCS r10, r10, r10\n\t" + "UMAAL r11, r10, lr, lr\n\t" + /* R[9] = r11 */ + "UMAAL r2, r8, r3, r7\n\t" + "UMAAL r2, r9, r4, r6\n\t" + "ADCS r3, r2, r2\n\t" + "UMAAL r10, r3, r5, r5\n\t" + /* R[10] = r10 */ + "MOV %[a], lr\n\t" + "UMAAL %[a], r8, r4, r7\n\t" + "UMAAL %[a], r9, r5, r6\n\t" + "ADCS r4, %[a], %[a]\n\t" + "UMAAL r3, r4, lr, lr\n\t" + /* R[11] = r3 */ + "UMAAL r8, r9, r5, r7\n\t" + "ADCS r8, r8, r8\n\t" + "UMAAL r4, r8, r6, r6\n\t" + /* R[12] = r4 */ + "MOV r5, lr\n\t" + "UMAAL r5, r9, r6, r7\n\t" + "ADCS r5, r5, r5\n\t" + "UMAAL r8, r5, lr, lr\n\t" + /* R[13] = r8 */ + "ADCS r9, r9, r9\n\t" + "UMAAL r9, r5, r7, r7\n\t" + "ADCS r7, r5, lr\n\t" + /* R[14] = r9 */ + /* R[15] = r7 */ + "MOV lr, sp\n\t" + "ADD lr, lr, #0x1c\n\t" + "STM lr!, {%[r], r12}\n\t" + "STM lr!, {r11}\n\t" + "STM lr!, {r10}\n\t" + "STM lr!, {r3, r4, r8, r9}\n\t" + "STM lr!, {r7}\n\t" + /* Start Reduction */ + "LDM sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "MOV r3, r11\n\t" + "MOV r4, r12\n\t" + /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ + /* - a[0] << 224 */ + /* + (a[0]-a[1] * 2) << (6 * 32) */ + "ADDS r11, r11, r5\n\t" + "ADC r12, r12, r6\n\t" + "ADDS r11, r11, r5\n\t" + "ADC r12, r12, r6\n\t" + /* - a[0] << (7 * 32) */ + "SUB r12, r12, r5\n\t" + /* + a[0]-a[4] << (3 * 32) */ + "MOV r0, r8\n\t" + "MOV r1, r9\n\t" + "MOV r2, r10\n\t" + "ADDS r8, r8, r5\n\t" + "ADCS r9, r9, r6\n\t" + "ADCS r10, r10, r7\n\t" + "ADCS r11, r11, r0\n\t" + "ADC r12, r12, r1\n\t" + /* a += mu * m */ + /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ + /* a[0] = = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += 
t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "ADDS r0, r0, r5\n\t" + "ADCS r1, r1, r6\n\t" + "ADCS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC lr, lr, #0x0\n\t" + "STR r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "ADD r0, sp, #0x20\n\t" + "LDM r0, {r2, r3, r4}\n\t" + "ADDS r2, r2, lr\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADC lr, lr, #0x0\n\t" + "STM r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "LDM r0, {r0, r1, r2, r3, r4}\n\t" + "ADDS r0, r0, lr\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r0, r0, r8\n\t" + "ADCS r1, r1, r9\n\t" + "ADCS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r0, r0, r10\n\t" + "ADCS r1, r1, r11\n\t" + "ADCS r2, r2, r12\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "STR r0, [sp, #44]\n\t" + "STR r1, [sp, #48]\n\t" + "STR r2, [sp, #52]\n\t" + "STR r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "ADD r0, sp, #0x1c\n\t" + "LDM r0, {r0, r1, r2, r3}\n\t" + "SUBS r0, r0, r5\n\t" + "SBCS r1, r1, r6\n\t" + "SBCS r2, r2, r7\n\t" + "SBCS r3, r3, r8\n\t" + "ADD r0, sp, #0x2c\n\t" + "MOV r8, r4\n\t" + "LDM r0, {r4, r5, r6, r7}\n\t" + "SBCS r4, r4, r9\n\t" + "SBCS r5, r5, r10\n\t" + "SBCS r6, r6, r11\n\t" + "SBCS r7, r7, r12\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBC lr, lr, #0x0\n\t" + /* mask m and sub from result if overflow */ + "RSB lr, lr, #0x0\n\t" + "SUBS r1, r1, lr\n\t" + "SBCS r2, r2, lr\n\t" + "SBCS r3, r3, lr\n\t" + "SBCS r4, r4, #0x0\n\t" + "SBCS r5, r5, #0x0\n\t" + "SBCS r6, r6, #0x0\n\t" + "SBCS r7, r7, lr, LSR #31\n\t" + "SBC r8, r8, lr\n\t" + "LDR %[r], [sp, #64]\n\t" + "STM %[r], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "ADD sp, sp, #0x44\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + (void)m_p; + (void)mp_p; +} + +#endif #if !defined(WOLFSSL_SP_SMALL) || defined(HAVE_COMP_KEY) /* Square the Montgomery form number a number of times. (r = a ^ n mod m) * @@ -18474,44 +32157,131 @@ static void sp_256_mont_inv_8(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
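A rough portable sketch, not part of the patch, of what the rewritten compare computes: scan from the most significant word down, let only the first differing word contribute, and return +1, 0 or -1. The helper name and the plain C types are assumptions; the assembly reaches the same result with IT-predicated moves instead of data-dependent branches.

#include <stdint.h>

static int32_t sp_256_cmp_8_ref(const uint32_t a[8], const uint32_t b[8])
{
    int32_t  r = 0;               /* final result: -1, 0 or +1               */
    uint32_t mask = 0xffffffffU;  /* all-ones until the first difference     */
    int i;

    for (i = 7; i >= 0; i--) {
        uint32_t av = a[i] & mask;
        uint32_t bv = b[i] & mask;
        r += (int32_t)(av > bv);  /* at most one word ever contributes       */
        r -= (int32_t)(av < bv);
        mask &= 0U - (uint32_t)(av == bv); /* clear after the first mismatch */
    }
    return r;
}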
*/ -SP_NOINLINE static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_256_cmp_8(const sp_digit* a_p, const sp_digit* b_p) { - sp_digit r = 0; - + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mvn r3, r3\n\t" - "mov r6, #28\n\t" - "\n1:\n\t" - "ldr r8, [%[a], r6]\n\t" - "ldr r5, [%[b], r6]\n\t" - "and r8, r8, r3\n\t" - "and r5, r5, r3\n\t" - "mov r4, r8\n\t" - "subs r8, r8, r5\n\t" - "sbc r8, r8, r8\n\t" - "add %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "subs r5, r5, r4\n\t" - "sbc r8, r8, r8\n\t" - "sub %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "sub r6, r6, #4\n\t" - "cmp r6, #0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 1b\n\t" + "MOV r2, #0x-1\n\t" + "MOV r8, #0x1\n\t" + "MOV r7, #0x0\n\t" + "MOV r3, #0x-1\n\t" +#ifdef WOLFSSL_SP_SMALL + "MOV r6, #0x1c\n\t" + "\n" + "L_sp_256_cmp_8_words_%=:\n\t" + "LDR r4, [%[a], r6]\n\t" + "LDR r5, [%[b], r6]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "SUBS r6, r6, #0x4\n\t" + "bcs L_sp_256_cmp_8_words_%=\n\t" + "EOR r2, r2, r3\n\t" #else - "bge.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "r3", "r4", "r5", "r6", "r8" + "LDR r4, [%[a], #28]\n\t" + "LDR r5, [%[b], #28]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #24]\n\t" + "LDR r5, [%[b], #24]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #20]\n\t" + "LDR r5, [%[b], #20]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #16]\n\t" + "LDR r5, [%[b], #16]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #12]\n\t" + "LDR r5, [%[b], #12]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #8]\n\t" + "LDR r5, [%[b], #8]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #4]\n\t" + "LDR r5, [%[b], #4]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[b]]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "EOR r2, r2, r3\n\t" +#endif /*WOLFSSL_SP_SMALL */ + "MOV %[a], r2\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); - - return r; + 
return (uint32_t)(size_t)a; } /* Normalize the values in each word to 32. @@ -18520,6 +32290,7 @@ SP_NOINLINE static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b) */ #define sp_256_norm_8(a) +#ifdef WOLFSSL_SP_SMALL /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -18528,265 +32299,645 @@ SP_NOINLINE static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b) * b A single precision number to subtract. * m Mask value to apply. */ -SP_NOINLINE static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a, - const sp_digit* b, sp_digit m) +static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #32\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r5, [%[a], r8]\n\t" - "sbcs r5, r5, r6\n\t" - "sbcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_256_cond_sub_8_words_%=:\n\t" + "SUBS r4, r8, r4\n\t" + "LDR r6, [%[a], r5]\n\t" + "LDR r7, [%[b], r5]\n\t" + "AND r7, r7, %[m]\n\t" + "SBCS r6, r6, r7\n\t" + "SBC r4, r8, r8\n\t" + "STR r6, [%[r], r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_256_cond_sub_8_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_256_cond_sub_8_words_%=\n\t" +#endif + "MOV %[r], r4\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
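For orientation only, not part of the patch: both cond_sub variants implement the same operation, subtract b masked with m (0 or all-ones) from a with borrow propagation and return the final borrow as 0 or 0xffffffff. A minimal portable sketch with a hypothetical name:

#include <stdint.h>

static uint32_t sp_256_cond_sub_8_ref(uint32_t r[8], const uint32_t a[8],
                                      const uint32_t b[8], uint32_t m)
{
    uint64_t borrow = 0;
    int i;

    for (i = 0; i < 8; i++) {
        uint64_t d = (uint64_t)a[i] - (b[i] & m) - borrow;
        r[i]   = (uint32_t)d;
        borrow = (d >> 32) & 1;            /* 1 when the word subtraction wrapped */
    }
    return (uint32_t)0 - (uint32_t)borrow; /* 0 or all-ones, like the final SBC   */
}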
+ */ +static sp_digit sp_256_cond_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r5, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SUBS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "SBC %[r], r5, r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifndef WOLFSSL_SP_SMALL +#define sp_256_mont_reduce_order_8 sp_256_mont_reduce_8 + +#ifdef WOLFSSL_SP_NO_UMAAL /* Reduce the number back to 256 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. */ -SP_NOINLINE static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - (void)mp; - (void)m; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( - "mov r2, #0\n\t" - "mov r1, #0\n\t" + "LDR lr, [%[m]]\n\t" /* i = 0 */ - "mov r9, r2\n\t" - "\n1:\n\t" - "mov r4, #0\n\t" - /* mu = a[i] * 1 (mp) = a[i] */ - "ldr r3, [%[a]]\n\t" - /* a[i] += -1 * mu = -1 * a[i] => a[i] = 0 no carry */ - /* a[i+1] += -1 * mu */ - "ldr r6, [%[a], #4]\n\t" - "mov r5, #0\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r2\n\t" - "str r4, [%[a], #4]\n\t" - /* a[i+2] += -1 * mu */ - "ldr r6, [%[a], #8]\n\t" - "mov r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adc r4, r4, r2\n\t" - "str r5, [%[a], #8]\n\t" - /* a[i+3] += 0 * mu */ - "ldr r6, [%[a], #12]\n\t" - "mov r5, #0\n\t" - "adds r4, r4, r3\n\t" - "adc r5, r5, r2\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r2\n\t" - "str r4, [%[a], #12]\n\t" - /* a[i+4] += 0 * mu */ - "ldr r6, [%[a], #16]\n\t" - "mov r4, #0\n\t" - "adds r5, r5, r6\n\t" - "adc r4, r4, r2\n\t" - "str r5, [%[a], #16]\n\t" - /* a[i+5] += 0 * mu */ - "ldr r6, [%[a], #20]\n\t" - "mov r5, #0\n\t" - "adds r4, r4, r6\n\t" - "adc r5, r5, r2\n\t" - "str r4, [%[a], #20]\n\t" - /* a[i+6] += 1 * mu */ - "ldr r6, [%[a], #24]\n\t" - "mov r4, #0\n\t" - "adds r5, r5, r3\n\t" - "adc r4, r4, r2\n\t" - "adds r5, r5, r6\n\t" - "adc r4, r4, r2\n\t" - "str r5, [%[a], #24]\n\t" - /* a[i+7] += -1 * mu */ - "ldr r6, [%[a], #28]\n\t" - "ldr r8, [%[a], #32]\n\t" - "adds r5, r1, r3\n\t" - "mov r1, #0\n\t" - "adc r1, r1, r2\n\t" - "subs r4, r4, r3\n\t" - "sbcs r5, r5, r2\n\t" - "sbc r1, r1, r2\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, 
r8\n\t" - "adc r1, r1, r2\n\t" - "str r4, [%[a], #28]\n\t" - "str r5, [%[a], #32]\n\t" + "MOV r11, #0x0\n\t" + "MOV r3, #0x0\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[a], #4]\n\t" + "\n" + "L_sp_256_mont_reduce_8_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL r10, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "MOV r7, #0x0\n\t" + "UMLAL r4, r7, r10, lr\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r9, [%[m], #4]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r5, r6, r10, r9\n\t" + "MOV r4, r5\n\t" + "ADDS r4, r4, r7\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r9, [%[m], #8]\n\t" + "LDR r5, [%[a], #8]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r5, r7, r10, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r9, [%[m], #12]\n\t" + "LDR r12, [%[a], #12]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #12]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r9, [%[m], #16]\n\t" + "LDR r12, [%[a], #16]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #16]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r9, [%[m], #20]\n\t" + "LDR r12, [%[a], #20]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #20]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r9, [%[m], #24]\n\t" + "LDR r12, [%[a], #24]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #24]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r9, [%[m], #28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r9\n\t" + "ADDS r7, r7, r8\n\t" + "ADCS r6, r9, r3\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, r3\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #28]\n\t" + "LDR r12, [%[a], #32]\n\t" + "ADCS r12, r12, r6\n\t" + "STR r12, [%[a], #32]\n\t" + "ADC r3, r3, #0x0\n\t" /* i += 1 */ - "add r9, r9, #1\n\t" - "add %[a], %[a], #4\n\t" - "mov r6, #8\n\t" - "cmp r9, r6\n\t" + "ADD r11, r11, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r11, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_256_mont_reduce_8_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "sub %[a], %[a], #32\n\t" - "mov r3, r1\n\t" - "sub r1, r1, #1\n\t" - "mvn r1, r1\n\t" - "ldr r4, [%[a],#32]\n\t" - "ldr r5, [%[a],#36]\n\t" - "ldr r6, [%[a],#40]\n\t" - "ldr r8, [%[a],#44]\n\t" - "ldr r9, [%[a],#48]\n\t" - "ldr r10, [%[a],#52]\n\t" - "ldr r11, [%[a],#56]\n\t" - "ldr r14, [%[a],#60]\n\t" - "subs r4, r4, r1\n\t" - "sbcs r5, r5, r1\n\t" - "sbcs r6, r6, r1\n\t" - "sbcs r8, r8, r2\n\t" - "sbcs r9, r9, r2\n\t" - "sbcs r10, r10, r2\n\t" - "sbcs r11, r11, r3\n\t" - "sbc r14, r14, r1\n\t" - "str r4, [%[a],#0]\n\t" - "str r5, [%[a],#4]\n\t" - "str r6, [%[a],#8]\n\t" - "str r8, [%[a],#12]\n\t" - "str r9, [%[a],#16]\n\t" - "str r10, [%[a],#20]\n\t" - "str r11, [%[a],#24]\n\t" - "str r14, [%[a],#28]\n\t" + "BLT.N L_sp_256_mont_reduce_8_word_%=\n\t" +#endif + /* Loop Done */ + "STR r4, [%[a]]\n\t" + "STR r5, [%[a], #4]\n\t" + "MOV %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp); +} + +#else +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. 
+ * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[a], #12]\n\t" + "LDR r10, [%[a], #16]\n\t" + "\n" + "L_sp_256_mont_reduce_8_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL lr, %[mp], r6\n\t" + /* a[i+0] += m[0] * mu */ + "LDR r12, [%[m]]\n\t" + "MOV r3, #0x0\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r12, [%[m], #4]\n\t" + "MOV r6, r7\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r12, [%[m], #8]\n\t" + "MOV r7, r8\n\t" + "UMAAL r7, r3, lr, r12\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r12, [%[m], #12]\n\t" + "MOV r8, r9\n\t" + "UMAAL r8, r3, lr, r12\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r12, [%[m], #16]\n\t" + "MOV r9, r10\n\t" + "UMAAL r9, r3, lr, r12\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r12, [%[m], #20]\n\t" + "LDR r10, [%[a], #20]\n\t" + "UMAAL r10, r3, lr, r12\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r12, [%[m], #24]\n\t" + "LDR r11, [%[a], #24]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r12, [%[m], #28]\n\t" + "LDR r11, [%[a], #28]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "LDR lr, [%[a], #32]\n\t" + "MOV r12, #0x0\n\t" + "UMAAL r3, lr, r12, r12\n\t" + "STR r11, [%[a], #28]\n\t" + "ADDS r3, r3, r5\n\t" + "ADC r5, lr, #0x0\n\t" + "STR r3, [%[a], #32]\n\t" + /* i += 1 */ + "ADD r4, r4, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r4, #0x20\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_256_mont_reduce_8_word_%=\n\t" +#else + "BLT.N L_sp_256_mont_reduce_8_word_%=\n\t" +#endif + /* Loop Done */ + "STR r6, [%[a]]\n\t" + "STR r7, [%[a], #4]\n\t" + "STR r8, [%[a], #8]\n\t" + "STR r9, [%[a], #12]\n\t" + "STR r10, [%[a], #16]\n\t" + "MOV %[mp], r5\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp); +} + +#endif +#else +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
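As a mental model, not part of the patch: the UMLAL/UMAAL loops above are the textbook word-by-word Montgomery reduction, while the routine that follows reaches the same result using only the shape of the P-256 prime. A portable sketch with hypothetical names:

#include <stdint.h>

static void mont_reduce_ref(uint32_t a[16], const uint32_t m[8], uint32_t mp)
{
    uint32_t over = 0;                        /* carry out of the current top word */
    int i;
    int j;

    for (i = 0; i < 8; i++) {
        uint32_t mu = a[i] * mp;              /* mu = a[i] * (-1/m) mod 2^32       */
        uint64_t c = 0;

        for (j = 0; j < 8; j++) {             /* a += mu * m << (32 * i)           */
            c += (uint64_t)mu * m[j] + a[i + j];
            a[i + j] = (uint32_t)c;
            c >>= 32;
        }
        c += (uint64_t)a[i + 8] + over;       /* fold the carry into the next word */
        a[i + 8] = (uint32_t)c;
        over = (uint32_t)(c >> 32);
    }
    /* The reduced value is a[8..15] plus 'over'; a final conditional subtract of
     * m, as in sp_256_cond_sub_8, completes the reduction. */
}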
+ */ +static void sp_256_mont_reduce_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x44\n\t" + "STR %[a], [sp, #64]\n\t" + "MOV lr, sp\n\t" + "LDM %[a]!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "STM lr!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "LDM %[a], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "STM lr, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + /* Start Reduction */ + "LDM sp, {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "MOV r3, r11\n\t" + "MOV r4, r12\n\t" + /* mu = a[0]-a[7] + a[0]-a[4] << 96 + (a[0]-a[1] * 2) << 192 */ + /* - a[0] << 224 */ + /* + (a[0]-a[1] * 2) << (6 * 32) */ + "ADDS r11, r11, r5\n\t" + "ADC r12, r12, r6\n\t" + "ADDS r11, r11, r5\n\t" + "ADC r12, r12, r6\n\t" + /* - a[0] << (7 * 32) */ + "SUB r12, r12, r5\n\t" + /* + a[0]-a[4] << (3 * 32) */ + "MOV r0, r8\n\t" + "MOV r1, r9\n\t" + "MOV r2, r10\n\t" + "ADDS r8, r8, r5\n\t" + "ADCS r9, r9, r6\n\t" + "ADCS r10, r10, r7\n\t" + "ADCS r11, r11, r0\n\t" + "ADC r12, r12, r1\n\t" + /* a += mu * m */ + /* += mu * ((1 << 256) - (1 << 224) + (1 << 192) + (1 << 96) - 1) */ + /* a[0] = = t[0] */ + /* a[1] = = t[1] */ + /* a[2] = = t[2] */ + /* a[3] += t[0] = t[3] */ + /* a[4] += t[1] = t[4] */ + /* a[5] += t[2] = t[5] */ + /* a[6] += t[0] + t[3] = t[6] */ + /* a[7] += t[1] + t[4] = t[7] + t[0] */ + "ADDS r0, r0, r5\n\t" + "ADCS r1, r1, r6\n\t" + "ADCS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC lr, lr, #0x0\n\t" + "STR r4, [sp, #28]\n\t" + /* a[8] += t[0] + t[2] + t[5] */ + /* a[9] += t[1] + t[3] + t[6] */ + /* a[10] += t[2] + t[4] + t[7] */ + "ADD r0, sp, #0x20\n\t" + "LDM r0, {r2, r3, r4}\n\t" + "ADDS r2, r2, lr\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r7\n\t" + "ADCS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADC lr, lr, #0x0\n\t" + "STM r0!, {r2, r3, r4}\n\t" + /* a[11] += t[3] + t[5] + carry */ + /* a[12] += t[4] + t[6] */ + /* a[13] += t[5] + t[7] */ + /* a[14] += t[6] */ + /* a[15] += t[7] */ + "LDM r0, {r0, r1, r2, r3, r4}\n\t" + "ADDS r0, r0, lr\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "MOV lr, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r0, r0, r8\n\t" + "ADCS r1, r1, r9\n\t" + "ADCS r2, r2, r10\n\t" + "ADCS r3, r3, r11\n\t" + "ADCS r4, r4, r12\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r0, r0, r10\n\t" + "ADCS r1, r1, r11\n\t" + "ADCS r2, r2, r12\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADC lr, lr, #0x0\n\t" + "STR r0, [sp, #44]\n\t" + "STR r1, [sp, #48]\n\t" + "STR r2, [sp, #52]\n\t" + "STR r3, [sp, #56]\n\t" + /* a[7..15] - t[0..7] */ + "ADD r0, sp, #0x1c\n\t" + "LDM r0, {r0, r1, r2, r3}\n\t" + "SUBS r0, r0, r5\n\t" + "SBCS r1, r1, r6\n\t" + "SBCS r2, r2, r7\n\t" + "SBCS r3, r3, r8\n\t" + "ADD r0, sp, #0x2c\n\t" + "MOV r8, r4\n\t" + "LDM r0, {r4, r5, r6, r7}\n\t" + "SBCS r4, r4, r9\n\t" + "SBCS r5, r5, r10\n\t" + "SBCS r6, r6, r11\n\t" + "SBCS r7, r7, r12\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBC lr, lr, #0x0\n\t" + /* mask m and sub from result if overflow */ + "RSB lr, lr, #0x0\n\t" + "SUBS r1, r1, lr\n\t" + "SBCS r2, r2, lr\n\t" + "SBCS r3, r3, 
lr\n\t" + "SBCS r4, r4, #0x0\n\t" + "SBCS r5, r5, #0x0\n\t" + "SBCS r6, r6, #0x0\n\t" + "SBCS r7, r7, lr, LSR #31\n\t" + "SBC r8, r8, lr\n\t" + "LDR %[a], [sp, #64]\n\t" + "STM %[a], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "ADD sp, sp, #0x44\n\t" : [a] "+r" (a) : - : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r14" + : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - - - (void)m; - (void)mp; + (void)m_p; + (void)mp_p; } +#ifdef WOLFSSL_SP_NO_UMAAL /* Reduce the number back to 256 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. */ -SP_NOINLINE static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - sp_digit ca = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( - "mov r9, %[mp]\n\t" - "mov r12, %[m]\n\t" - "mov r10, %[a]\n\t" - "mov r4, #0\n\t" - "add r11, r10, #32\n\t" - "\n1:\n\t" + "LDR lr, [%[m]]\n\t" + /* i = 0 */ + "MOV r11, #0x0\n\t" + "MOV r3, #0x0\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[a], #4]\n\t" + "\n" + "L_sp_256_mont_reduce_order_8_word_%=:\n\t" /* mu = a[i] * mp */ - "mov %[mp], r9\n\t" - "ldr %[a], [r10]\n\t" - "mul %[mp], %[mp], %[a]\n\t" - "mov %[m], r12\n\t" - "add r14, r10, #24\n\t" - "\n2:\n\t" - /* a[i+j] += m[j] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+j+1] += m[j+1] * mu */ - "ldr %[a], [r10]\n\t" - "mov r4, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r4, r4, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r5, r5, %[a]\n\t" - "adc r4, r4, #0\n\t" - "str r5, [r10], #4\n\t" - "cmp r10, r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 2b\n\t" -#else - "blt.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ + "MUL r10, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "MOV r7, #0x0\n\t" + "UMLAL r4, r7, r10, lr\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r9, [%[m], #4]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r5, r6, r10, r9\n\t" + "MOV r4, r5\n\t" + "ADDS r4, r4, r7\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r9, [%[m], #8]\n\t" + "LDR r5, [%[a], #8]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r5, r7, r10, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r9, [%[m], #12]\n\t" + "LDR r12, [%[a], #12]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #12]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r9, [%[m], #16]\n\t" + "LDR r12, [%[a], #16]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #16]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r9, [%[m], #20]\n\t" + "LDR r12, [%[a], #20]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, 
r7\n\t" + "STR r12, [%[a], #20]\n\t" + "ADC r6, r6, #0x0\n\t" /* a[i+6] += m[6] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" + "LDR r9, [%[m], #24]\n\t" + "LDR r12, [%[a], #24]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #24]\n\t" + "ADC r7, r7, #0x0\n\t" /* a[i+7] += m[7] * mu */ - "mov r4, %[ca]\n\t" - "mov %[ca], #0\n\t" - /* Multiply m[7] and mu - Start */ - "ldr r8, [%[m]]\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc %[ca], %[ca], #0\n\t" - /* Multiply m[7] and mu - Done */ - "ldr r6, [r10]\n\t" - "ldr r8, [r10, #4]\n\t" - "adds r6, r6, r5\n\t" - "adcs r8, r8, r4\n\t" - "adc %[ca], %[ca], #0\n\t" - "str r6, [r10]\n\t" - "str r8, [r10, #4]\n\t" - /* Next word in a */ - "sub r10, r10, #24\n\t" - "cmp r10, r11\n\t" + "LDR r9, [%[m], #28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r9\n\t" + "ADDS r7, r7, r8\n\t" + "ADCS r6, r9, r3\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, r3\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #28]\n\t" + "LDR r12, [%[a], #32]\n\t" + "ADCS r12, r12, r6\n\t" + "STR r12, [%[a], #32]\n\t" + "ADC r3, r3, #0x0\n\t" + /* i += 1 */ + "ADD r11, r11, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r11, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_256_mont_reduce_order_8_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "mov %[m], r12\n\t" - : [ca] "+r" (ca), [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" + "BLT.N L_sp_256_mont_reduce_order_8_word_%=\n\t" +#endif + /* Loop Done */ + "STR r4, [%[a]]\n\t" + "STR r5, [%[a], #4]\n\t" + "MOV %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - - sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - ca); + sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp); } +#else +/* Reduce the number back to 256 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static void sp_256_mont_reduce_order_8(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[a], #12]\n\t" + "LDR r10, [%[a], #16]\n\t" + "\n" + "L_sp_256_mont_reduce_order_8_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL lr, %[mp], r6\n\t" + /* a[i+0] += m[0] * mu */ + "LDR r12, [%[m]]\n\t" + "MOV r3, #0x0\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r12, [%[m], #4]\n\t" + "MOV r6, r7\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r12, [%[m], #8]\n\t" + "MOV r7, r8\n\t" + "UMAAL r7, r3, lr, r12\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r12, [%[m], #12]\n\t" + "MOV r8, r9\n\t" + "UMAAL r8, r3, lr, r12\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r12, [%[m], #16]\n\t" + "MOV r9, r10\n\t" + "UMAAL r9, r3, lr, r12\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r12, [%[m], #20]\n\t" + "LDR r10, [%[a], #20]\n\t" + "UMAAL r10, r3, lr, r12\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r12, [%[m], #24]\n\t" + "LDR r11, [%[a], #24]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r12, [%[m], #28]\n\t" + "LDR r11, [%[a], #28]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "LDR lr, [%[a], #32]\n\t" + "MOV r12, #0x0\n\t" + "UMAAL r3, lr, r12, r12\n\t" + "STR r11, [%[a], #28]\n\t" + "ADDS r3, r3, r5\n\t" + "ADC r5, lr, #0x0\n\t" + "STR r3, [%[a], #32]\n\t" + /* i += 1 */ + "ADD r4, r4, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r4, #0x20\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_256_mont_reduce_order_8_word_%=\n\t" +#else + "BLT.N L_sp_256_mont_reduce_order_8_word_%=\n\t" +#endif + /* Loop Done */ + "STR r6, [%[a]]\n\t" + "STR r7, [%[a], #4]\n\t" + "STR r8, [%[a], #8]\n\t" + "STR r9, [%[a], #12]\n\t" + "STR r10, [%[a], #16]\n\t" + "MOV %[mp], r5\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + sp_256_cond_sub_8(a - 8, a, m, (sp_digit)0 - mp); +} + +#endif +#endif /* WOLFSSL_SP_SMALL */ /* Map the Montgomery form projective coordinate point to an affine point. * * r Resulting affine coordinate point. @@ -18834,68 +32985,53 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p, * b Second number to add in Montgomery form. * m Modulus (prime). 
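A minimal portable sketch, not part of the patch, of the modular addition below, assuming both inputs are already below m: add, form the tentative difference with m, then pick the correct result with a mask instead of a branch. Names are hypothetical; the assembly instead folds the correction into the SBCS chain using the sparse P-256 words.

#include <stdint.h>

static void mont_add_ref(uint32_t r[8], const uint32_t a[8],
                         const uint32_t b[8], const uint32_t m[8])
{
    uint32_t s[8];
    uint32_t t[8];
    uint32_t mask;
    uint64_t c = 0;
    uint64_t bw = 0;
    int i;

    for (i = 0; i < 8; i++) {                 /* s = a + b                      */
        c += (uint64_t)a[i] + b[i];
        s[i] = (uint32_t)c;
        c >>= 32;
    }
    for (i = 0; i < 8; i++) {                 /* t = s - m, tracking the borrow */
        uint64_t d = (uint64_t)s[i] - m[i] - bw;
        t[i] = (uint32_t)d;
        bw = (d >> 32) & 1;
    }
    /* Keep t when the sum carried out of 2^256 or is still >= m, else keep s. */
    mask = 0U - (uint32_t)((c != 0) | (bw == 0));
    for (i = 0; i < 8; i++) {
        r[i] = (t[i] & mask) | (s[i] & ~mask);
    }
}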
*/ -SP_NOINLINE static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +static void sp_256_mont_add_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldr r4, [%[a],#0]\n\t" - "ldr r5, [%[a],#4]\n\t" - "ldr r6, [%[a],#8]\n\t" - "ldr r8, [%[a],#12]\n\t" - "ldr r9, [%[b],#0]\n\t" - "ldr r10, [%[b],#4]\n\t" - "ldr r11, [%[b],#8]\n\t" - "ldr r14, [%[b],#12]\n\t" - "adds r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adcs r6, r6, r11\n\t" - "adcs r8, r8, r14\n\t" - "str r4, [%[r],#0]\n\t" - "str r5, [%[r],#4]\n\t" - "str r6, [%[r],#8]\n\t" - "str r8, [%[r],#12]\n\t" - "ldr r4, [%[a],#16]\n\t" - "ldr r5, [%[a],#20]\n\t" - "ldr r6, [%[a],#24]\n\t" - "ldr r8, [%[a],#28]\n\t" - "ldr r9, [%[b],#16]\n\t" - "ldr r10, [%[b],#20]\n\t" - "ldr r11, [%[b],#24]\n\t" - "ldr r14, [%[b],#28]\n\t" - "adcs r4, r4, r9\n\t" - "adcs r5, r5, r10\n\t" - "adcs r6, r6, r11\n\t" - "adcs r8, r8, r14\n\t" - "adc r3, r12, #0\n\t" - "sub r3, r12, r3\n\t" - "and r12, r3, #1\n\t" - "ldr r9, [%[r],#0]\n\t" - "ldr r10, [%[r],#4]\n\t" - "ldr r11, [%[r],#8]\n\t" - "ldr r14, [%[r],#12]\n\t" - "subs r9, r9, r3\n\t" - "sbcs r10, r10, r3\n\t" - "sbcs r11, r11, r3\n\t" - "sbcs r14, r14, #0\n\t" - "sbcs r4, r4, #0\n\t" - "sbcs r5, r5, #0\n\t" - "sbcs r6, r6, r12\n\t" - "sbc r8, r8, r3\n\t" - "str r9, [%[r],#0]\n\t" - "str r10, [%[r],#4]\n\t" - "str r11, [%[r],#8]\n\t" - "str r14, [%[r],#12]\n\t" - "str r4, [%[r],#16]\n\t" - "str r5, [%[r],#20]\n\t" - "str r6, [%[r],#24]\n\t" - "str r8, [%[r],#28]\n\t" + "MOV lr, #0x0\n\t" + "LDM %[a], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "LDM %[b]!, {r3, r4}\n\t" + "ADDS r5, r5, r3\n\t" + "ADCS r6, r6, r4\n\t" + "LDM %[b]!, {r3, r4}\n\t" + "ADCS r7, r7, r3\n\t" + "ADCS r8, r8, r4\n\t" + "LDM %[b]!, {r3, r4}\n\t" + "ADCS r9, r9, r3\n\t" + "ADCS r10, r10, r4\n\t" + "LDM %[b]!, {r3, r4}\n\t" + "ADCS r11, r11, r3\n\t" + "ADCS r12, r12, r4\n\t" + "ADC lr, lr, #0x0\n\t" + "RSB lr, lr, #0x0\n\t" + "SUBS r5, r5, lr\n\t" + "SBCS r6, r6, lr\n\t" + "SBCS r7, r7, lr\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "SBCS r10, r10, #0x0\n\t" + "SBCS r11, r11, lr, LSR #31\n\t" + "SBCS r12, r12, lr\n\t" + "SBC %[b], %[b], %[b]\n\t" + "SUB lr, lr, %[b]\n\t" + "SUBS r5, r5, lr\n\t" + "SBCS r6, r6, lr\n\t" + "SBCS r7, r7, lr\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "SBCS r10, r10, #0x0\n\t" + "SBCS r11, r11, lr, LSR #31\n\t" + "SBC r12, r12, lr\n\t" + "STM %[r], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r14", "r3", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); + (void)m_p; } /* Double a Montgomery form number (r = a + a % m). @@ -18904,51 +33040,48 @@ SP_NOINLINE static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a, const * a Number to double in Montgomery form. * m Modulus (prime). 
*/ -SP_NOINLINE static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_256_mont_dbl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldr r4, [%[a],#0]\n\t" - "ldr r5, [%[a],#4]\n\t" - "ldr r6, [%[a],#8]\n\t" - "ldr r8, [%[a],#12]\n\t" - "ldr r9, [%[a],#16]\n\t" - "ldr r10, [%[a],#20]\n\t" - "ldr r11, [%[a],#24]\n\t" - "ldr r14, [%[a],#28]\n\t" - "adds r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adcs r11, r11, r11\n\t" - "adcs r14, r14, r14\n\t" - "adc r3, r12, #0\n\t" - "sub r3, r12, r3\n\t" - "and r12, r3, #1\n\t" - "subs r4, r4, r3\n\t" - "sbcs r5, r5, r3\n\t" - "sbcs r6, r6, r3\n\t" - "sbcs r8, r8, #0\n\t" - "sbcs r9, r9, #0\n\t" - "sbcs r10, r10, #0\n\t" - "sbcs r11, r11, r12\n\t" - "sbc r14, r14, r3\n\t" - "str r4, [%[r],#0]\n\t" - "str r5, [%[r],#4]\n\t" - "str r6, [%[r],#8]\n\t" - "str r8, [%[r],#12]\n\t" - "str r9, [%[r],#16]\n\t" - "str r10, [%[r],#20]\n\t" - "str r11, [%[r],#24]\n\t" - "str r14, [%[r],#28]\n\t" + "MOV r2, #0x0\n\t" + "LDM %[a], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "ADC r2, r2, #0x0\n\t" + "RSB r2, r2, #0x0\n\t" + "SUBS r4, r4, r2\n\t" + "SBCS r5, r5, r2\n\t" + "SBCS r6, r6, r2\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "SBCS r10, r10, r2, LSR #31\n\t" + "SBCS r11, r11, r2\n\t" + "SBC %[a], %[a], %[a]\n\t" + "SUB r2, r2, %[a]\n\t" + "SUBS r4, r4, r2\n\t" + "SBCS r5, r5, r2\n\t" + "SBCS r6, r6, r2\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "SBCS r10, r10, r2, LSR #31\n\t" + "SBC r11, r11, r2\n\t" + "STM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r14", "r3", "r12" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2" ); + (void)m_p; } /* Triple a Montgomery form number (r = a + a + a % m). @@ -18957,83 +33090,72 @@ SP_NOINLINE static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a, const * a Number to triple in Montgomery form. * m Modulus (prime). 
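Not part of the patch: conceptually the tripling below is a doubling followed by one more modular addition, each with its own masked correction. In terms of the hypothetical mont_add_ref sketched earlier:

static void mont_tpl_ref(uint32_t r[8], const uint32_t a[8], const uint32_t m[8])
{
    mont_add_ref(r, a, a, m);   /* r = 2*a mod m */
    mont_add_ref(r, r, a, m);   /* r = 3*a mod m */
}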
*/ -SP_NOINLINE static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_256_mont_tpl_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "ldr r2, [%[a],#0]\n\t" - "ldr r3, [%[a],#4]\n\t" - "ldr r4, [%[a],#8]\n\t" - "ldr r5, [%[a],#12]\n\t" - "ldr r6, [%[a],#16]\n\t" - "ldr r8, [%[a],#20]\n\t" - "ldr r9, [%[a],#24]\n\t" - "ldr r10, [%[a],#28]\n\t" - "adds r2, r2, r2\n\t" - "adcs r3, r3, r3\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "mov r11, #0\n\t" - "mov r14, #0\n\t" - "adc r11, r11, r11\n\t" - "mov r12, r11\n\t" - "sub r11, r11, #1\n\t" - "mvn r11, r11\n\t" - "subs r2, r2, r11\n\t" - "sbcs r3, r3, r11\n\t" - "sbcs r4, r4, r11\n\t" - "sbcs r5, r5, r14\n\t" - "sbcs r6, r6, r14\n\t" - "sbcs r8, r8, r14\n\t" - "sbcs r9, r9, r12\n\t" - "sbc r10, r10, r11\n\t" - "ldr r12, [%[a],#0]\n\t" - "ldr r14, [%[a],#4]\n\t" - "adds r2, r2, r12\n\t" - "adcs r3, r3, r14\n\t" - "ldr r12, [%[a],#8]\n\t" - "ldr r14, [%[a],#12]\n\t" - "adcs r4, r4, r12\n\t" - "adcs r5, r5, r14\n\t" - "ldr r12, [%[a],#16]\n\t" - "ldr r14, [%[a],#20]\n\t" - "adcs r6, r6, r12\n\t" - "adcs r8, r8, r14\n\t" - "ldr r12, [%[a],#24]\n\t" - "ldr r14, [%[a],#28]\n\t" - "adcs r9, r9, r12\n\t" - "adcs r10, r10, r14\n\t" - "mov r11, #0\n\t" - "mov r14, #0\n\t" - "adc r11, r11, r11\n\t" - "mov r12, r11\n\t" - "sub r11, r11, #1\n\t" - "mvn r11, r11\n\t" - "subs r2, r2, r11\n\t" - "str r2, [%[r],#0]\n\t" - "sbcs r3, r3, r11\n\t" - "str r3, [%[r],#4]\n\t" - "sbcs r4, r4, r11\n\t" - "str r4, [%[r],#8]\n\t" - "sbcs r5, r5, r14\n\t" - "str r5, [%[r],#12]\n\t" - "sbcs r6, r6, r14\n\t" - "str r6, [%[r],#16]\n\t" - "sbcs r8, r8, r14\n\t" - "str r8, [%[r],#20]\n\t" - "sbcs r9, r9, r12\n\t" - "str r9, [%[r],#24]\n\t" - "sbc r10, r10, r11\n\t" - "str r10, [%[r],#28]\n\t" + "MOV r12, #0x0\n\t" + "LDM %[a], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "ADC r12, r12, #0x0\n\t" + "RSB r12, r12, #0x0\n\t" + "SUBS r4, r4, r12\n\t" + "SBCS r5, r5, r12\n\t" + "SBCS r6, r6, r12\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "SBCS r10, r10, r12, LSR #31\n\t" + "SBCS r11, r11, r12\n\t" + "RSB r12, r12, #0x0\n\t" + "SBC r12, r12, #0x0\n\t" + "LDM %[a]!, {r2, r3}\n\t" + "ADDS r4, r4, r2\n\t" + "ADCS r5, r5, r3\n\t" + "LDM %[a]!, {r2, r3}\n\t" + "ADCS r6, r6, r2\n\t" + "ADCS r7, r7, r3\n\t" + "LDM %[a]!, {r2, r3}\n\t" + "ADCS r8, r8, r2\n\t" + "ADCS r9, r9, r3\n\t" + "LDM %[a]!, {r2, r3}\n\t" + "ADCS r10, r10, r2\n\t" + "ADCS r11, r11, r3\n\t" + "ADC r12, r12, #0x0\n\t" + "RSB r12, r12, #0x0\n\t" + "SUBS r4, r4, r12\n\t" + "SBCS r5, r5, r12\n\t" + "SBCS r6, r6, r12\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "SBCS r10, r10, r12, LSR #31\n\t" + "SBCS r11, r11, r12\n\t" + "SBC r2, r2, r2\n\t" + "SUB r12, r12, r2\n\t" + "SUBS r4, r4, r12\n\t" + "SBCS r5, r5, r12\n\t" + "SBCS r6, r6, r12\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "SBCS r10, r10, r12, LSR #31\n\t" + "SBC r11, r11, r12\n\t" + "STM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + : [r] "+r" 
(r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r11", "r12", "r14", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2", "r3", "r12" ); + (void)m_p; } /* Subtract two Montgomery form numbers (r = a - b % m). @@ -19043,154 +33165,107 @@ SP_NOINLINE static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a, const * b Number to subtract with in Montgomery form. * m Modulus (prime). */ -SP_NOINLINE static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +static void sp_256_mont_sub_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldr r4, [%[a],#0]\n\t" - "ldr r5, [%[a],#4]\n\t" - "ldr r6, [%[a],#8]\n\t" - "ldr r8, [%[a],#12]\n\t" - "ldr r9, [%[b],#0]\n\t" - "ldr r10, [%[b],#4]\n\t" - "ldr r11, [%[b],#8]\n\t" - "ldr r14, [%[b],#12]\n\t" - "subs r4, r4, r9\n\t" - "sbcs r5, r5, r10\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r8, r8, r14\n\t" - "str r4, [%[r],#0]\n\t" - "str r5, [%[r],#4]\n\t" - "str r6, [%[r],#8]\n\t" - "str r8, [%[r],#12]\n\t" - "ldr r4, [%[a],#16]\n\t" - "ldr r5, [%[a],#20]\n\t" - "ldr r6, [%[a],#24]\n\t" - "ldr r8, [%[a],#28]\n\t" - "ldr r9, [%[b],#16]\n\t" - "ldr r10, [%[b],#20]\n\t" - "ldr r11, [%[b],#24]\n\t" - "ldr r14, [%[b],#28]\n\t" - "sbcs r4, r4, r9\n\t" - "sbcs r5, r5, r10\n\t" - "sbcs r6, r6, r11\n\t" - "sbcs r8, r8, r14\n\t" - "sbc r3, r12, #0\n\t" - "and r12, r3, #1\n\t" - "ldr r9, [%[r],#0]\n\t" - "ldr r10, [%[r],#4]\n\t" - "ldr r11, [%[r],#8]\n\t" - "ldr r14, [%[r],#12]\n\t" - "adds r9, r9, r3\n\t" - "adcs r10, r10, r3\n\t" - "adcs r11, r11, r3\n\t" - "adcs r14, r14, #0\n\t" - "adcs r4, r4, #0\n\t" - "adcs r5, r5, #0\n\t" - "adcs r6, r6, r12\n\t" - "adc r8, r8, r3\n\t" - "str r9, [%[r],#0]\n\t" - "str r10, [%[r],#4]\n\t" - "str r11, [%[r],#8]\n\t" - "str r14, [%[r],#12]\n\t" - "str r4, [%[r],#16]\n\t" - "str r5, [%[r],#20]\n\t" - "str r6, [%[r],#24]\n\t" - "str r8, [%[r],#28]\n\t" + "MOV lr, #0x0\n\t" + "LDM %[a], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + "LDM %[b]!, {r3, r4}\n\t" + "SUBS r5, r5, r3\n\t" + "SBCS r6, r6, r4\n\t" + "LDM %[b]!, {r3, r4}\n\t" + "SBCS r7, r7, r3\n\t" + "SBCS r8, r8, r4\n\t" + "LDM %[b]!, {r3, r4}\n\t" + "SBCS r9, r9, r3\n\t" + "SBCS r10, r10, r4\n\t" + "LDM %[b]!, {r3, r4}\n\t" + "SBCS r11, r11, r3\n\t" + "SBCS r12, r12, r4\n\t" + "SBC lr, lr, #0x0\n\t" + "ADDS r5, r5, lr\n\t" + "ADCS r6, r6, lr\n\t" + "ADCS r7, r7, lr\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADCS r11, r11, lr, LSR #31\n\t" + "ADCS r12, r12, lr\n\t" + "ADC lr, lr, #0x0\n\t" + "ADDS r5, r5, lr\n\t" + "ADCS r6, r6, lr\n\t" + "ADCS r7, r7, lr\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADCS r11, r11, lr, LSR #31\n\t" + "ADC r12, r12, lr\n\t" + "STM %[r], {r5, r6, r7, r8, r9, r10, r11, r12}\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : [r] "r" (r), [a] "r" (a), [b] "r" (b) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r14", "r3", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); + (void)m_p; } -#define sp_256_mont_sub_lower_8 sp_256_mont_sub_8 /* Divide the number by 2 mod the modulus (prime). 
(r = a / 2 % m) * * r Result of division by 2. * a Number to divide. * m Modulus (prime). */ -SP_NOINLINE static void sp_256_div2_8(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; + __asm__ __volatile__ ( - "ldr r8, [%[a], #0]\n\t" - "lsl r8, r8, #31\n\t" - "lsr r8, r8, #31\n\t" - "mov r5, #0\n\t" - "sub r5, r5, r8\n\t" - "mov r8, #0\n\t" - "lsl r6, r5, #31\n\t" - "lsr r6, r6, #31\n\t" - "ldr r3, [%[a], #0]\n\t" - "ldr r4, [%[a], #4]\n\t" - "adds r3, r3, r5\n\t" - "adcs r4, r4, r5\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "ldr r3, [%[a], #8]\n\t" - "ldr r4, [%[a], #12]\n\t" - "adcs r3, r3, r5\n\t" - "adcs r4, r4, r8\n\t" - "str r3, [%[r], #8]\n\t" - "str r4, [%[r], #12]\n\t" - "ldr r3, [%[a], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "adcs r3, r3, r8\n\t" - "adcs r4, r4, r8\n\t" - "str r3, [%[r], #16]\n\t" - "str r4, [%[r], #20]\n\t" - "ldr r3, [%[a], #24]\n\t" - "ldr r4, [%[a], #28]\n\t" - "adcs r3, r3, r6\n\t" - "adcs r4, r4, r5\n\t" - "adc r8, r8, r8\n\t" - "lsl r8, r8, #31\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, #31\n\t" - "lsr r6, r4, #1\n\t" - "lsl r4, r4, #31\n\t" - "orr r5, r5, r4\n\t" - "orr r6, r6, r8\n\t" - "mov r8, r3\n\t" - "str r5, [%[r], #24]\n\t" - "str r6, [%[r], #28]\n\t" - "ldr r3, [%[a], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, #31\n\t" - "lsr r6, r4, #1\n\t" - "lsl r4, r4, #31\n\t" - "orr r5, r5, r4\n\t" - "orr r6, r6, r8\n\t" - "mov r8, r3\n\t" - "str r5, [%[r], #16]\n\t" - "str r6, [%[r], #20]\n\t" - "ldr r3, [%[a], #8]\n\t" - "ldr r4, [%[a], #12]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, #31\n\t" - "lsr r6, r4, #1\n\t" - "lsl r4, r4, #31\n\t" - "orr r5, r5, r4\n\t" - "orr r6, r6, r8\n\t" - "mov r8, r3\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "ldr r3, [%[r], #0]\n\t" - "ldr r4, [%[r], #4]\n\t" - "lsr r5, r3, #1\n\t" - "lsr r6, r4, #1\n\t" - "lsl r4, r4, #31\n\t" - "orr r5, r5, r4\n\t" - "orr r6, r6, r8\n\t" - "str r5, [%[r], #0]\n\t" - "str r6, [%[r], #4]\n\t" + "LDM %[a], {r4, r5, r6, r7}\n\t" + "AND r3, r4, #0x1\n\t" + "RSB r8, r3, #0x0\n\t" + "AND r9, r8, #0x1\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r5, r5, r8\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, #0x0\n\t" + "STM %[r], {r4, r5, r6, r7}\n\t" + "LDRD r4, r5, [%[a], #16]\n\t" + "LDRD r6, r7, [%[a], #24]\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, r9\n\t" + "ADCS r7, r7, r8\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "LSR r8, r4, #1\n\t" + "LSR r9, r5, #1\n\t" + "LSR r10, r6, #1\n\t" + "LSR r11, r7, #1\n\t" + "ORR r8, r8, r5, lsl #31\n\t" + "ORR r9, r9, r6, lsl #31\n\t" + "ORR r10, r10, r7, lsl #31\n\t" + "ORR r11, r11, r3, lsl #31\n\t" + "MOV r3, r4\n\t" + "STRD r8, r9, [%[r], #16]\n\t" + "STRD r10, r11, [%[r], #24]\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LSR r8, r4, #1\n\t" + "LSR r9, r5, #1\n\t" + "LSR r10, r6, #1\n\t" + "LSR r11, r7, #1\n\t" + "ORR r8, r8, r5, lsl #31\n\t" + "ORR r9, r9, r6, lsl #31\n\t" + "ORR r10, r10, r7, lsl #31\n\t" + "ORR r11, r11, r3, lsl #31\n\t" + "STM %[r], {r8, r9, r10, r11}\n\t" + : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m) : - : [r] "r" (r), [a] "r" (a), [m] "r" (m) - : "memory", "r3", "r4", "r5", "r6", "r8" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3" ); } @@ -19248,7 +33323,7 
@@ static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p, /* X = X - Y */ sp_256_mont_sub_8(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_lower_8(y, y, x, p256_mod); + sp_256_mont_sub_8(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -19370,7 +33445,7 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con break; case 16: /* Y = Y - X */ - sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -19431,12 +33506,12 @@ static int sp_256_iszero_8(const sp_digit* a) static void sp_256_proj_point_add_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*8; - sp_digit* t3 = t + 4*8; - sp_digit* t4 = t + 6*8; - sp_digit* t5 = t + 8*8; - sp_digit* t6 = t + 10*8; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*8; + sp_digit* t2 = t + 4*8; + sp_digit* t3 = t + 6*8; + sp_digit* t4 = t + 8*8; + sp_digit* t5 = t + 10*8; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod); @@ -19458,17 +33533,9 @@ static void sp_256_proj_point_add_8(sp_point_256* r, sp_256_proj_point_dbl_8(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_256_mont_sub_8(t2, t2, t1, p256_mod); @@ -19487,20 +33554,31 @@ static void sp_256_proj_point_add_8(sp_point_256* r, sp_256_mont_dbl_8(t3, y, p256_mod); sp_256_mont_sub_8(x, x, t3, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_8(y, y, x, p256_mod); + sp_256_mont_sub_8(y, y, x, p256_mod); sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_8(y, y, t5, p256_mod); - for (i = 0; i < 8; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 8; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 8; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -19546,12 +33624,12 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*8; - ctx->t3 = t + 4*8; - ctx->t4 = t + 6*8; - ctx->t5 = t + 8*8; - ctx->t6 = t + 10*8; + ctx->t6 = t; + ctx->t1 = t + 2*8; + ctx->t2 = t + 4*8; + ctx->t3 = t + 6*8; + ctx->t4 = t + 8*8; + ctx->t5 = t + 10*8; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -19658,7 +33736,7 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, 
p256_mod); + sp_256_mont_sub_8(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 22; break; case 22: @@ -19671,22 +33749,28 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 8; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 8; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 8; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -19958,8 +34042,6 @@ static int sp_256_ecc_mulmod_fast_8(sp_point_256* r, const sp_point_256* g, cons } #ifdef FP_ECC -#define sp_256_mont_dbl_lower_8 sp_256_mont_dbl_8 -#define sp_256_mont_tpl_lower_8 sp_256_mont_tpl_8 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. @@ -19998,7 +34080,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_8(a, t1, p256_mod); + sp_256_mont_tpl_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -20007,8 +34089,8 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_8(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_8(b, t2, p256_mod); + sp_256_mont_sub_8(t2, b, x, p256_mod); + sp_256_mont_dbl_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -20028,7 +34110,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_8(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_8(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_8(a, t1, p256_mod); + sp_256_mont_tpl_8(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_8(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_8(b, t1, x, p256_mod, p256_mp_mod); @@ -20037,8 +34119,8 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i, sp_256_mont_dbl_8(t2, b, p256_mod); sp_256_mont_sub_8(x, x, t2, p256_mod); /* B = 2.(B - X) */ - sp_256_mont_sub_lower_8(t2, b, x, p256_mod); - sp_256_mont_dbl_lower_8(b, t2, p256_mod); + sp_256_mont_sub_8(t2, b, x, p256_mod); + sp_256_mont_dbl_8(b, t2, p256_mod); /* Z = Z*Y */ sp_256_mont_mul_8(z, z, y, p256_mod, p256_mp_mod); /* t1 = Y^4 */ @@ -20094,12 +34176,12 @@ typedef struct sp_table_entry_256 { static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - 
sp_digit* t1 = t; - sp_digit* t2 = t + 2*8; - sp_digit* t3 = t + 4*8; - sp_digit* t4 = t + 6*8; - sp_digit* t5 = t + 8*8; - sp_digit* t6 = t + 10*8; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*8; + sp_digit* t6 = t + 4*8; + sp_digit* t1 = t + 6*8; + sp_digit* t4 = t + 8*8; + sp_digit* t5 = t + 10*8; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -20115,13 +34197,9 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, sp_256_proj_point_dbl_8(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_256_mont_sub_8(t2, t2, p->x, p256_mod); @@ -20130,33 +34208,40 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, /* Z3 = H*Z1 */ sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_256_mont_sqr_8(t1, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t3, p->x, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_8(x, t1, t5, p256_mod); - sp_256_mont_dbl_8(t1, t3, p256_mod); - sp_256_mont_sub_8(x, x, t1, p256_mod); + sp_256_mont_sqr_8(t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t3, p->x, t1, p256_mod, p256_mp_mod); + sp_256_mont_mul_8(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_8(t2, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_8(t2, t2, t1, p256_mod); + sp_256_mont_dbl_8(t5, t3, p256_mod); + sp_256_mont_sub_8(x, t2, t5, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_lower_8(t3, t3, x, p256_mod); + sp_256_mont_sub_8(t3, t3, x, p256_mod); sp_256_mont_mul_8(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_8(t5, t5, p->y, p256_mod, p256_mp_mod); - sp_256_mont_sub_8(y, t3, t5, p256_mod); + sp_256_mont_mul_8(t1, t1, p->y, p256_mod, p256_mp_mod); + sp_256_mont_sub_8(y, t3, t1, p256_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 8; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 8; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 8; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 8; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 8; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -21072,7 +35157,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_256* point = NULL; + sp_point_256* point = NULL; sp_digit* k = NULL; #else sp_point_256 point[2]; @@ -22632,7 +36717,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, + point = 
(sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -22691,38 +36776,26 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, * * a A single precision integer. */ -SP_NOINLINE static void sp_256_add_one_8(sp_digit* a) +static void sp_256_add_one_8(sp_digit* a_p) { + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r2, #1\n\t" - "ldr r1, [%[a], #0]\n\t" - "adds r1, r1, r2\n\t" - "mov r2, #0\n\t" - "str r1, [%[a], #0]\n\t" - "ldr r1, [%[a], #4]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #4]\n\t" - "ldr r1, [%[a], #8]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #8]\n\t" - "ldr r1, [%[a], #12]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #12]\n\t" - "ldr r1, [%[a], #16]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #16]\n\t" - "ldr r1, [%[a], #20]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #20]\n\t" - "ldr r1, [%[a], #24]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #24]\n\t" - "ldr r1, [%[a], #28]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #28]\n\t" + "LDM %[a], {r1, r2, r3, r4}\n\t" + "ADDS r1, r1, #0x1\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4}\n\t" + "LDM %[a], {r1, r2, r3, r4}\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4}\n\t" + : [a] "+r" (a) : - : [a] "r" (a) - : "memory", "r1", "r2" + : "memory", "r1", "r2", "r3", "r4" ); } @@ -22817,7 +36890,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_256* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -22825,7 +36898,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -23101,126 +37174,183 @@ int sp_ecc_secret_gen_256_nb(sp_ecc_ctx_t* sp_ctx, const mp_int* priv, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_256_sub_in_place_8(sp_digit* a, - const sp_digit* b) +static sp_digit sp_256_sub_in_place_8(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r8, %[a]\n\t" - "add r8, r8, #32\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #8\n\t" - "add %[b], %[b], #8\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" -#else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r8" - ); + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; - return c; + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "ADD r11, %[a], #0x20\n\t" + "\n" + "L_sp_256_sub_in_pkace_8_word_%=:\n\t" + "RSBS r10, r10, #0x0\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC r10, r10, r10\n\t" + "CMP %[a], r11\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BNE L_sp_256_sub_in_pkace_8_word_%=\n\t" +#else + "BNE.N L_sp_256_sub_in_pkace_8_word_%=\n\t" +#endif + "MOV %[a], r10\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)a; } #else -/* Sub b from a into r. (r = a - b) +/* Sub b from a into a. (a -= b) * - * r A single precision integer. - * a A single precision integer. + * a A single precision integer and result. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_256_sub_in_place_8(sp_digit* a, - const sp_digit* b) +static sp_digit sp_256_sub_in_place_8(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); - - return c; + return (uint32_t)(size_t)a; } #endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL /* Mul a by digit b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision digit. */ -SP_NOINLINE static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a, - sp_digit b) +static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + __asm__ __volatile__ ( - "add r9, %[a], #32\n\t" /* A[0] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r5, r3, r6, %[b]\n\t" - "mov r4, #0\n\t" - "str r5, [%[r]], #4\n\t" - /* A[0] * B - Done */ - "\n1:\n\t" - "mov r5, #0\n\t" - /* A[] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r6, r8, r6, %[b]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[] * B - Done */ - "str r3, [%[r]], #4\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "cmp %[a], r9\n\t" + "LDR r8, [%[a]]\n\t" + "UMULL r5, r3, %[b], r8\n\t" + "MOV r4, #0x0\n\t" + "STR r5, [%[r]]\n\t" + "MOV r5, #0x0\n\t" + "MOV r9, #0x4\n\t" + "\n" + "L_sp_256_mul_d_8_word_%=:\n\t" + /* A[i] * B */ + "LDR r8, [%[a], r9]\n\t" + "UMULL r6, r7, %[b], r8\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], r9]\n\t" + "MOV r3, r4\n\t" + "MOV r4, r5\n\t" + "MOV r5, #0x0\n\t" + "ADD r9, r9, #0x4\n\t" + "CMP r9, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_256_mul_d_8_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r]]\n\t" - : [r] "+r" (r), [a] "+r" (a) - : [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9" + "BLT.N L_sp_256_mul_d_8_word_%=\n\t" +#endif + "STR r3, [%[r], #32]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } +#else +/* Mul a by digit b into r. 
(r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision digit. + */ +static void sp_256_mul_d_8(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + + __asm__ __volatile__ ( + /* A[0] * B */ + "LDM %[a]!, {r8}\n\t" + "UMULL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[1] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[2] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[3] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[4] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[5] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[6] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[7] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "STR r5, [%[r]]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_USE_UDIV /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -23230,49 +37360,122 @@ SP_NOINLINE static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a, * * Note that this is an approximate div. It may give an answer 1 larger. 
*/ -SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, - sp_digit div) +static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - sp_digit r = 0; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( - "lsr r6, %[div], #16\n\t" - "add r6, r6, #1\n\t" - "udiv r4, %[d1], r6\n\t" - "lsl r8, r4, #16\n\t" - "umull r4, r5, %[div], r8\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r5, %[d1], r6\n\t" - "lsl r4, r5, #16\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r4, %[d0], %[div]\n\t" - "add r8, r8, r4\n\t" - "mov %[r], r8\n\t" - : [r] "+r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "r4", "r5", "r6", "r8" + "LSR r8, %[div], #16\n\t" + "ADD r5, r8, #0x1\n\t" + "UDIV r6, %[d1], r5\n\t" + "LSL r7, %[div], #16\n\t" + "LSL r6, r6, #16\n\t" + "UMULL r3, r4, %[div], r6\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "SUBS r3, %[d1], r5\n\t" + "SBC r9, r9, r9\n\t" + "ADD r9, r9, #0x1\n\t" + "RSB r10, r9, #0x0\n\t" + "LSL r9, r9, #16\n\t" + "AND r7, r7, r10\n\t" + "AND r8, r8, r10\n\t" + "SUBS %[d0], %[d0], r7\n\t" + "ADD r6, r6, r9\n\t" + "SBC %[d1], %[d1], r8\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "UMULL r3, r4, %[div], r3\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "MUL r3, %[div], r3\n\t" + "SUB %[d0], %[d0], r3\n\t" + "UDIV r3, %[d0], %[div]\n\t" + "ADD %[d1], r6, r3\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - return r; + return (uint32_t)(size_t)d1; } +#else +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. + * div The divisor. + * returns the result of the division. + * + * Note that this is an approximate div. It may give an answer 1 larger. 
+ */ +static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +{ + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; + + __asm__ __volatile__ ( + "LSR r5, %[div], #1\n\t" + "ADD r5, r5, #0x1\n\t" + "MOV r6, %[d0]\n\t" + "MOV r7, %[d1]\n\t" + /* Do top 32 */ + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "MOV r3, #0x0\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + /* Next 30 bits */ + "MOV r4, #0x1d\n\t" + "\n" + "L_div_256_word_8_bit_%=:\n\t" + "LSLS r6, r6, #1\n\t" + "ADC r7, r7, r7\n\t" + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "ADD r3, r3, r3\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + "SUBS r4, r4, #0x1\n\t" + "bpl L_div_256_word_8_bit_%=\n\t" + "ADD r3, r3, r3\n\t" + "ADD r3, r3, #0x1\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "SUBS r8, %[div], r9\n\t" + "SBC r8, r8, r8\n\t" + "SUB %[d1], r3, r8\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)d1; +} + +#endif /* AND m into each word of a and store in r. * * r A single precision integer. @@ -23492,7 +37695,7 @@ static void sp_256_mont_inv_order_8(sp_digit* r, const sp_digit* a, sp_256_mont_sqr_n_order_8(t2, t3, 4); /* t = a^ff = t2 * t3 */ sp_256_mont_mul_order_8(t, t2, t3); - /* t3= a^ff00 = t ^ 2 ^ 8 */ + /* t2= a^ff00 = t ^ 2 ^ 8 */ sp_256_mont_sqr_n_order_8(t2, t, 8); /* t = a^ffff = t2 * t */ sp_256_mont_mul_order_8(t, t2, t); @@ -23509,7 +37712,11 @@ static void sp_256_mont_inv_order_8(sp_digit* r, const sp_digit* a, /* t2= a^ffffffff00000000ffffffffffffffff = t2 * t */ sp_256_mont_mul_order_8(t2, t2, t); /* t2= a^ffffffff00000000ffffffffffffffffbce6 */ - for (i=127; i>=112; i--) { + sp_256_mont_sqr_order_8(t2, t2); + sp_256_mont_mul_order_8(t2, t2, a); + sp_256_mont_sqr_n_order_8(t2, t2, 5); + sp_256_mont_mul_order_8(t2, t2, t3); + for (i=121; i>=112; i--) { sp_256_mont_sqr_order_8(t2, t2); if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) { sp_256_mont_mul_order_8(t2, t2, a); @@ -23914,48 +38121,41 @@ int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W #endif /* HAVE_ECC_SIGN */ #ifndef WOLFSSL_SP_SMALL -static void sp_256_rshift1_8(sp_digit* r, const sp_digit* a) +static void sp_256_rshift1_8(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r10, #0\n\t" - "mov r9, #0\n\t" - "ldr r3, [%[a], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[a], #24]\n\t" - "ldr r6, [%[a], #28]\n\t" - "lsr r7, r3, #1\n\t" - "and r3, r3, #1\n\t" - "lsr r8, r4, #1\n\t" - "lsr r10, r5, #1\n\t" - "lsr r14, r6, #1\n\t" - "orr r7, r7, r4, lsl #31\n\t" - "orr r8, r8, r5, lsl #31\n\t" - "orr r10, r10, r6, lsl #31\n\t" - "orr r14, r14, r9, lsl #31\n\t" - "mov r9, r3\n\t" - "str r7, [%[r], #16]\n\t" - "str r8, [%[r], #20]\n\t" - "str r10, [%[r], #24]\n\t" - "str r14, [%[r], #28]\n\t" - "ldr r3, [%[r], #0]\n\t" - "ldr r4, [%[r], #4]\n\t" - "ldr r5, [%[r], #8]\n\t" - "ldr r6, [%[r], 
#12]\n\t" - "lsr r7, r3, #1\n\t" - "lsr r8, r4, #1\n\t" - "lsr r10, r5, #1\n\t" - "lsr r14, r6, #1\n\t" - "orr r7, r7, r4, lsl #31\n\t" - "orr r8, r8, r5, lsl #31\n\t" - "orr r10, r10, r6, lsl #31\n\t" - "orr r14, r14, r9, lsl #31\n\t" - "str r7, [%[r], #0]\n\t" - "str r8, [%[r], #4]\n\t" - "str r10, [%[r], #8]\n\t" - "str r14, [%[r], #12]\n\t" + "MOV r10, #0x0\n\t" + "LDRD r2, r3, [%[a], #16]\n\t" + "LDRD r4, r5, [%[a], #24]\n\t" + "LSR r6, r2, #1\n\t" + "LSR r7, r3, #1\n\t" + "LSR r8, r4, #1\n\t" + "LSR r9, r5, #1\n\t" + "ORR r6, r6, r3, lsl #31\n\t" + "ORR r7, r7, r4, lsl #31\n\t" + "ORR r8, r8, r5, lsl #31\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "MOV r10, r2\n\t" + "STRD r6, r7, [%[r], #16]\n\t" + "STRD r8, r9, [%[r], #24]\n\t" + "LDRD r2, r3, [%[a]]\n\t" + "LDRD r4, r5, [%[a], #8]\n\t" + "LSR r6, r2, #1\n\t" + "LSR r7, r3, #1\n\t" + "LSR r8, r4, #1\n\t" + "LSR r9, r5, #1\n\t" + "ORR r6, r6, r3, lsl #31\n\t" + "ORR r7, r7, r4, lsl #31\n\t" + "ORR r8, r8, r5, lsl #31\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "STRD r6, r7, [%[r]]\n\t" + "STRD r8, r9, [%[r], #8]\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10", "r14", "r9" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } @@ -23965,156 +38165,178 @@ static void sp_256_rshift1_8(sp_digit* r, const sp_digit* a) * a Number to divide. * m Modulus. */ -static void sp_256_div2_mod_8(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_256_div2_mod_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; + __asm__ __volatile__ ( - "mov r10, #0\n\t" - "ldr r3, [%[a], #0]\n\t" - "ands r9, r3, #1\n\t" - "beq 1f\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[a], #8]\n\t" - "ldr r6, [%[a], #12]\n\t" - "ldr r7, [%[m], #0]\n\t" - "ldr r8, [%[m], #4]\n\t" - "ldr r10, [%[m], #8]\n\t" - "ldr r14, [%[m], #12]\n\t" - "adds r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r10\n\t" - "adcs r6, r6, r14\n\t" - "str r3, [%[r], #0]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #8]\n\t" - "str r6, [%[r], #12]\n\t" - "ldr r3, [%[a], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[a], #24]\n\t" - "ldr r6, [%[a], #28]\n\t" - "ldr r7, [%[m], #16]\n\t" - "ldr r8, [%[m], #20]\n\t" - "ldr r10, [%[m], #24]\n\t" - "ldr r14, [%[m], #28]\n\t" - "adcs r3, r3, r7\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r10\n\t" - "adcs r6, r6, r14\n\t" - "adc r9, r10, r10\n\t" - "b 2f\n\t" - "\n1:\n\t" - "ldr r3, [%[a], #16]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[a], #24]\n\t" - "ldr r6, [%[a], #28]\n\t" - "\n2:\n\t" - "lsr r7, r3, #1\n\t" - "and r3, r3, #1\n\t" - "lsr r8, r4, #1\n\t" - "lsr r10, r5, #1\n\t" - "lsr r14, r6, #1\n\t" - "orr r7, r7, r4, lsl #31\n\t" - "orr r8, r8, r5, lsl #31\n\t" - "orr r10, r10, r6, lsl #31\n\t" - "orr r14, r14, r9, lsl #31\n\t" - "mov r9, r3\n\t" - "str r7, [%[r], #16]\n\t" - "str r8, [%[r], #20]\n\t" - "str r10, [%[r], #24]\n\t" - "str r14, [%[r], #28]\n\t" - "ldr r3, [%[r], #0]\n\t" - "ldr r4, [%[r], #4]\n\t" - "ldr r5, [%[r], #8]\n\t" - "ldr r6, [%[r], #12]\n\t" - "lsr r7, r3, #1\n\t" - "lsr r8, r4, #1\n\t" - "lsr r10, r5, #1\n\t" - "lsr r14, r6, #1\n\t" - "orr r7, r7, r4, lsl #31\n\t" - "orr r8, r8, r5, lsl #31\n\t" - "orr r10, r10, r6, lsl #31\n\t" - "orr r14, r14, r9, lsl #31\n\t" - "str r7, [%[r], #0]\n\t" - "str r8, [%[r], 
#4]\n\t" - "str r10, [%[r], #8]\n\t" - "str r14, [%[r], #12]\n\t" + "MOV r12, #0x0\n\t" + "LDM %[a]!, {r4}\n\t" + "ANDS r3, r4, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_div2_mod_8_even_%=\n\t" +#else + "BEQ.N L_sp_256_div2_mod_8_even_%=\n\t" +#endif + "LDM %[a]!, {r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "ADC r3, r12, r12\n\t" + "B L_sp_256_div2_mod_8_div2_%=\n\t" + "\n" + "L_sp_256_div2_mod_8_even_%=:\n\t" + "LDRD r4, r5, [%[a], #12]\n\t" + "LDRD r6, r7, [%[a], #20]\n\t" + "\n" + "L_sp_256_div2_mod_8_div2_%=:\n\t" + "LSR r8, r4, #1\n\t" + "AND r4, r4, #0x1\n\t" + "LSR r9, r5, #1\n\t" + "LSR r10, r6, #1\n\t" + "LSR r11, r7, #1\n\t" + "ORR r8, r8, r5, lsl #31\n\t" + "ORR r9, r9, r6, lsl #31\n\t" + "ORR r10, r10, r7, lsl #31\n\t" + "ORR r11, r11, r3, lsl #31\n\t" + "MOV r3, r4\n\t" + "STRD r8, r9, [%[r], #16]\n\t" + "STRD r10, r11, [%[r], #24]\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LSR r8, r4, #1\n\t" + "LSR r9, r5, #1\n\t" + "LSR r10, r6, #1\n\t" + "LSR r11, r7, #1\n\t" + "ORR r8, r8, r5, lsl #31\n\t" + "ORR r9, r9, r6, lsl #31\n\t" + "ORR r10, r10, r7, lsl #31\n\t" + "ORR r11, r11, r3, lsl #31\n\t" + "STM %[r], {r8, r9, r10, r11}\n\t" + : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m) : - : [r] "r" (r), [a] "r" (a), [m] "r" (m) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10", "r14", "r9" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); } -static int sp_256_num_bits_8(sp_digit* a) +static int sp_256_num_bits_8(const sp_digit* a_p) { - int r = 0; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "ldr r2, [%[a], #28]\n\t" - "cmp r2, #0\n\t" - "beq 7f\n\t" - "mov r3, #256\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 9f\n\t" - "\n7:\n\t" - "ldr r2, [%[a], #24]\n\t" - "cmp r2, #0\n\t" - "beq 6f\n\t" - "mov r3, #224\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 9f\n\t" - "\n6:\n\t" - "ldr r2, [%[a], #20]\n\t" - "cmp r2, #0\n\t" - "beq 5f\n\t" - "mov r3, #192\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 9f\n\t" - "\n5:\n\t" - "ldr r2, [%[a], #16]\n\t" - "cmp r2, #0\n\t" - "beq 4f\n\t" - "mov r3, #160\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 9f\n\t" - "\n4:\n\t" - "ldr r2, [%[a], #12]\n\t" - "cmp r2, #0\n\t" - "beq 3f\n\t" - "mov r3, #128\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 9f\n\t" - "\n3:\n\t" - "ldr r2, [%[a], #8]\n\t" - "cmp r2, #0\n\t" - "beq 2f\n\t" - "mov r3, #96\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 9f\n\t" - "\n2:\n\t" - "ldr r2, [%[a], #4]\n\t" - "cmp r2, #0\n\t" - "beq 1f\n\t" - "mov r3, #64\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 9f\n\t" - "\n1:\n\t" - "ldr r2, [%[a], #0]\n\t" - "mov r3, #32\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "\n9:\n\t" - : [r] "+r" (r) - : [a] "r" (a) - : "r2", "r3" + "LDR r1, [%[a], #28]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_num_bits_8_7_%=\n\t" +#else + "BEQ.N L_sp_256_num_bits_8_7_%=\n\t" +#endif + "MOV r2, #0x100\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_256_num_bits_8_9_%=\n\t" + "\n" + 
"L_sp_256_num_bits_8_7_%=:\n\t" + "LDR r1, [%[a], #24]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_num_bits_8_6_%=\n\t" +#else + "BEQ.N L_sp_256_num_bits_8_6_%=\n\t" +#endif + "MOV r2, #0xe0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_256_num_bits_8_9_%=\n\t" + "\n" + "L_sp_256_num_bits_8_6_%=:\n\t" + "LDR r1, [%[a], #20]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_num_bits_8_5_%=\n\t" +#else + "BEQ.N L_sp_256_num_bits_8_5_%=\n\t" +#endif + "MOV r2, #0xc0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_256_num_bits_8_9_%=\n\t" + "\n" + "L_sp_256_num_bits_8_5_%=:\n\t" + "LDR r1, [%[a], #16]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_num_bits_8_4_%=\n\t" +#else + "BEQ.N L_sp_256_num_bits_8_4_%=\n\t" +#endif + "MOV r2, #0xa0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_256_num_bits_8_9_%=\n\t" + "\n" + "L_sp_256_num_bits_8_4_%=:\n\t" + "LDR r1, [%[a], #12]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_num_bits_8_3_%=\n\t" +#else + "BEQ.N L_sp_256_num_bits_8_3_%=\n\t" +#endif + "MOV r2, #0x80\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_256_num_bits_8_9_%=\n\t" + "\n" + "L_sp_256_num_bits_8_3_%=:\n\t" + "LDR r1, [%[a], #8]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_num_bits_8_2_%=\n\t" +#else + "BEQ.N L_sp_256_num_bits_8_2_%=\n\t" +#endif + "MOV r2, #0x60\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_256_num_bits_8_9_%=\n\t" + "\n" + "L_sp_256_num_bits_8_2_%=:\n\t" + "LDR r1, [%[a], #4]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_256_num_bits_8_1_%=\n\t" +#else + "BEQ.N L_sp_256_num_bits_8_1_%=\n\t" +#endif + "MOV r2, #0x40\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_256_num_bits_8_9_%=\n\t" + "\n" + "L_sp_256_num_bits_8_1_%=:\n\t" + "LDR r1, [%[a]]\n\t" + "MOV r2, #0x20\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "\n" + "L_sp_256_num_bits_8_9_%=:\n\t" + "MOV %[a], r4\n\t" + : [a] "+r" (a) + : + : "memory", "r1", "r2", "r3", "r4", "r5" ); - - return r; + return (uint32_t)(size_t)a; } /* Non-constant time modular inversion. @@ -25200,215 +39422,1902 @@ static const sp_digit p384_b[12] = { }; #endif +#ifdef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static void sp_384_mul_12(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[12 * 2]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r10, %[a]\n\t" - "mov r11, %[b]\n\t" - "mov r6, #48\n\t" - "add r6, r6, r10\n\t" - "mov r14, r6\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r5, #0\n\t" - "mov r6, #44\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov %[b], r9\n\t" - "sub %[b], %[b], %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add %[b], %[b], r11\n\t" - "\n2:\n\t" - /* Multiply Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [%[b]]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply Done */ - "add %[a], %[a], #4\n\t" - "sub %[b], %[b], #4\n\t" - "cmp %[a], r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, r9\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r12\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #88\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r], r8]\n\t" - "mov %[a], r10\n\t" - "mov %[b], r11\n\t" - : - : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; - XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); + __asm__ __volatile__ ( + "SUB sp, sp, #0x60\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "\n" + "L_sp_384_mul_12_outer_%=:\n\t" + "SUBS r3, r5, #0x2c\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_384_mul_12_inner_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x30\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_mul_12_inner_done_%=\n\t" +#else + "BEQ.N L_sp_384_mul_12_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_384_mul_12_inner_%=\n\t" +#else + "BLE.N L_sp_384_mul_12_inner_%=\n\t" +#endif + "\n" + "L_sp_384_mul_12_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x58\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_384_mul_12_outer_%=\n\t" +#else + "BLE.N L_sp_384_mul_12_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + 
"L_sp_384_mul_12_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_384_mul_12_store_%=\n\t" +#else + "BGT.N L_sp_384_mul_12_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" + ); } +#else +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x30\n\t" + /* A[0] * B[0] */ + "LDR r11, [%[a]]\n\t" + "LDR r12, [%[b]]\n\t" + "UMULL r3, r4, r11, r12\n\t" + "MOV r5, #0x0\n\t" + "STR r3, [sp]\n\t" + /* A[0] * B[1] */ + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[0] */ + "LDR r8, [%[a], #4]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #4]\n\t" + /* A[2] * B[0] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[1] */ + "LDR r11, [%[a], #4]\n\t" + "LDR r12, [%[b], #4]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[0] * B[2] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #8]\n\t" + /* A[0] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[2] */ + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[1] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[0] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #12]\n\t" + /* A[4] * B[0] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[1] */ + "LDR r8, [%[a], #12]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[2] */ + "LDR r11, [%[a], #8]\n\t" + "LDR r12, [%[b], #8]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[3] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[0] * B[4] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #16]\n\t" + /* A[0] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, 
r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[4] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[2] */ + "LDR r8, [%[a], #12]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[1] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[0] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #20]\n\t" + /* A[6] * B[0] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[1] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[2] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[3] */ + "LDR r11, [%[a], #12]\n\t" + "LDR r12, [%[b], #12]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[4] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[5] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[0] * B[6] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #24]\n\t" + /* A[0] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[6] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[5] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[4] */ + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[3] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[2] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[1] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[0] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + 
"STR r4, [sp, #28]\n\t" + /* A[8] * B[0] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[1] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[2] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[3] */ + "LDR r8, [%[a], #20]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[4] */ + "LDR r11, [%[a], #16]\n\t" + "LDR r12, [%[b], #16]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[5] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[6] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[7] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[0] * B[8] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #32]\n\t" + /* A[0] * B[9] */ + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[8] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[7] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[6] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[4] */ + "LDR r8, [%[a], #20]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[3] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[2] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[1] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[0] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #36]\n\t" + /* A[10] * B[0] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[1] */ + "LDR r8, 
[%[a], #36]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[2] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[3] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[4] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[5] */ + "LDR r11, [%[a], #20]\n\t" + "LDR r12, [%[b], #20]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[6] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[7] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[8] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[9] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[0] * B[10] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #40]\n\t" + /* A[0] * B[11] */ + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[10] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[9] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[8] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[7] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[6] */ + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[5] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[4] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[3] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[2] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[1] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, 
r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[0] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #44]\n\t" + /* A[11] * B[1] */ + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[2] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[3] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[4] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[5] */ + "LDR r8, [%[a], #28]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[6] */ + "LDR r11, [%[a], #24]\n\t" + "LDR r12, [%[b], #24]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[7] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[8] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[9] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[10] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[11] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #48]\n\t" + /* A[2] * B[11] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[10] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[9] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[8] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[6] */ + "LDR r8, [%[a], #28]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[5] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[4] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, 
r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[3] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[2] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #52]\n\t" + /* A[11] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[4] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[5] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[6] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[7] */ + "LDR r11, [%[a], #28]\n\t" + "LDR r12, [%[b], #28]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[8] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[9] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[10] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[11] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #56]\n\t" + /* A[4] * B[11] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[10] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[9] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[8] */ + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[7] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[6] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[5] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[4] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #60]\n\t" + /* A[11] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, 
r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[6] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[7] */ + "LDR r8, [%[a], #36]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[8] */ + "LDR r11, [%[a], #32]\n\t" + "LDR r12, [%[b], #32]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[9] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[10] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[11] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #64]\n\t" + /* A[6] * B[11] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[10] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[9] */ + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[8] */ + "LDR r8, [%[a], #36]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[7] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[6] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #68]\n\t" + /* A[11] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[8] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[9] */ + "LDR r11, [%[a], #36]\n\t" + "LDR r12, [%[b], #36]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[10] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[11] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #72]\n\t" + /* A[8] * B[11] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[10] */ + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[9] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * 
B[8] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #76]\n\t" + /* A[11] * B[9] */ + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[10] */ + "LDR r11, [%[a], #40]\n\t" + "LDR r12, [%[b], #40]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[11] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #80]\n\t" + /* A[10] * B[11] */ + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[10] */ + "LDR r8, [%[a], #44]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #84]\n\t" + /* A[11] * B[11] */ + "UMLAL r4, r5, r8, r9\n\t" + "STR r4, [%[r], #88]\n\t" + "STR r5, [%[r], #92]\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL /* Square a and put result in r. (r = a * a) * * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_384_sqr_12(sp_digit* r, const sp_digit* a) +static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #96\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #44\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" + "SUB sp, sp, #0x60\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_384_sqr_12_outer_%=:\n\t" + "SUBS r3, r5, #0x2c\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_384_sqr_12_inner_%=:\n\t" + "CMP r4, r3\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 4f\n\t" + "BEQ L_sp_384_sqr_12_op_sqr_%=\n\t" #else - "beq.n 4f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ + "BEQ.N L_sp_384_sqr_12_op_sqr_%=\n\t" +#endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[a], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "bal 
L_sp_384_sqr_12_op_done_%=\n\t" + "\n" + "L_sp_384_sqr_12_op_sqr_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "\n" + "L_sp_384_sqr_12_op_done_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bal 5f\n\t" + "BEQ L_sp_384_sqr_12_inner_done_%=\n\t" #else - "bal.n 5f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #48\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" + "BEQ.N L_sp_384_sqr_12_inner_done_%=\n\t" +#endif + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" + "BGT L_sp_384_sqr_12_inner_done_%=\n\t" #else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "cmp %[a], r2\n\t" + "BGT.N L_sp_384_sqr_12_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bgt 3f\n\t" + "BLE L_sp_384_sqr_12_inner_%=\n\t" #else - "bgt.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" + "BLE.N L_sp_384_sqr_12_inner_%=\n\t" +#endif + "\n" + "L_sp_384_sqr_12_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x58\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" + "BLE L_sp_384_sqr_12_outer_%=\n\t" #else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #88\n\t" - "cmp r8, r6\n\t" + "BLE.N L_sp_384_sqr_12_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_384_sqr_12_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" + "BGT L_sp_384_sqr_12_store_%=\n\t" #else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #92\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, #96\n\t" - "add sp, sp, r6\n\t" + "BGT.N L_sp_384_sqr_12_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } +#else +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
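
Both sp_384_sqr_12 variants (the WOLFSSL_SP_SMALL loop above and the unrolled version that follows) compute the same column arithmetic: for every output word of the 768-bit square, each off-diagonal product A[i]*A[j] is added twice and each diagonal A[i]*A[i] once, with three registers acting as a rolling accumulator. A rough C rendering of that arithmetic is sketched below; the _sketch name, the uint64_t accumulator and the loop form are illustrative only and not part of the change.

    #include <stdint.h>

    typedef uint32_t sp_digit;

    /* Illustrative loop form of the 12-word (384-bit) schoolbook square. */
    static void sp_384_sqr_12_sketch(sp_digit* r, const sp_digit* a)
    {
        uint64_t acc = 0;   /* low two words of the running column sum */
        uint32_t carry = 0; /* third accumulator word                   */
        int i, k;

        for (k = 0; k <= 22; k++) {
            for (i = (k < 12) ? 0 : (k - 11); i <= k / 2; i++) {
                uint64_t p = (uint64_t)a[i] * a[k - i];
                uint64_t prev = acc;
                acc += p;
                carry += (acc < prev);     /* carry out of the 64-bit add */
                if (i != k - i) {          /* off-diagonal: add it again  */
                    prev = acc;
                    acc += p;
                    carry += (acc < prev);
                }
            }
            r[k] = (sp_digit)acc;          /* emit the finished column */
            acc = (acc >> 32) | ((uint64_t)carry << 32);
            carry = 0;
        }
        r[23] = (sp_digit)acc;             /* top word of the 768-bit result */
    }
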
+ */ +static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x30\n\t" + /* A[0] * A[0] */ + "LDR r10, [%[a]]\n\t" + "UMULL r8, r3, r10, r10\n\t" + "MOV r4, #0x0\n\t" + "STR r8, [sp]\n\t" + /* A[0] * A[1] */ + "LDR r10, [%[a], #4]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [sp, #4]\n\t" + /* A[0] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * A[1] */ + "LDR r10, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #8]\n\t" + /* A[0] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r2, [sp, #12]\n\t" + /* A[0] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[1] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[2] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [sp, #16]\n\t" + /* A[0] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #20]\n\t" + /* A[0] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS 
r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #24]\n\t" + /* A[0] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #28]\n\t" + /* A[0] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #32]\n\t" + /* A[0] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #36]\n\t" + /* A[0] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + 
"ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #40]\n\t" + /* A[0] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #44]\n\t" + /* A[1] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[2] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #48]\n\t" + /* A[2] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[3] * A[10] */ + "LDR r10, [%[a], 
#40]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [%[r], #52]\n\t" + /* A[3] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[4] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #56]\n\t" + /* A[4] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[5] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #60]\n\t" + /* A[5] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[6] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, 
r7\n\t" + "STR r3, [%[r], #64]\n\t" + /* A[6] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[7] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #68]\n\t" + /* A[7] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r2, [%[r], #72]\n\t" + /* A[8] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[9] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [%[r], #76]\n\t" + /* A[9] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #80]\n\t" + /* A[10] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r2, [%[r], #84]\n\t" + /* A[11] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "UMLAL r3, r4, r10, r10\n\t" + "STR r3, [%[r], #88]\n\t" + "STR r4, [%[r], #92]\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Add b to a into r. (r = a + b) * @@ -25416,39 +41325,39 @@ SP_NOINLINE static void sp_384_sqr_12(sp_digit* r, const sp_digit* a) * a A single precision integer. 
* b A single precision integer. */ -SP_NOINLINE static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r8, #0\n\t" - "add r6, r6, #48\n\t" - "sub r8, r8, #1\n\t" - "\n1:\n\t" - "adds %[c], %[c], r8\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r3, #0x0\n\t" + "ADD r12, %[a], #0x30\n\t" + "\n" + "L_sp_384_add_12_word_%=:\n\t" + "ADDS r3, r3, #0x-1\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "MOV r4, #0x0\n\t" + "ADC r3, r4, #0x0\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_384_add_12_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_384_add_12_word_%=\n\t" +#endif + "MOV %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #else @@ -25458,50 +41367,41 @@ SP_NOINLINE static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
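
For reference, the chained ADDS/ADCS in both sp_384_add_12 variants (the word loop above and the unrolled version below) amounts to the following loop; names and the uint64_t intermediate are illustrative only. The returned carry is what callers such as the Montgomery add/double helpers later turn into a mask for the conditional subtract.

    #include <stdint.h>

    typedef uint32_t sp_digit;

    /* 12-word add: one carry bit chained across the words, final carry
     * (0 or 1) returned to the caller. */
    static sp_digit sp_384_add_12_sketch(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
    {
        sp_digit carry = 0;
        int i;

        for (i = 0; i < 12; i++) {
            uint64_t t = (uint64_t)a[i] + b[i] + carry;
            r[i] = (sp_digit)t;
            carry = (sp_digit)(t >> 32);
        }
        return carry;
    }
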
*/ -SP_NOINLINE static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_384_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -25512,37 +41412,38 @@ SP_NOINLINE static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
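
The sp_384_sub_12 variants that follow are the borrow-propagating mirror of the add: SUBS/SBCS chain one borrow bit through the twelve words, and the "SBC r, x, x" at the end turns the final borrow into 0 or all ones. A loop sketch of the same arithmetic, with illustrative names:

    #include <stdint.h>

    typedef uint32_t sp_digit;

    /* 12-word subtract: returns 0 when a >= b, all ones when it borrowed. */
    static sp_digit sp_384_sub_12_sketch(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
    {
        sp_digit borrow = 0;
        int i;

        for (i = 0; i < 12; i++) {
            uint64_t t = (uint64_t)a[i] - b[i] - borrow;
            r[i] = (sp_digit)t;
            borrow = (sp_digit)((t >> 32) & 1); /* 1 if this word borrowed */
        }
        return (sp_digit)0 - borrow;            /* 0 or 0xffffffff */
    }
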
*/ -SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "add r6, r6, #48\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "sbcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r11, #0x0\n\t" + "ADD r12, %[a], #0x30\n\t" + "\n" + "L_sp_384_sub_12_word_%=:\n\t" + "RSBS r11, r11, #0x0\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC r11, r3, r3\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_384_sub_12_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_384_sub_12_word_%=\n\t" +#endif + "MOV %[r], r11\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #else @@ -25552,49 +41453,40 @@ SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_384_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -25710,14 +41602,14 @@ static void sp_384_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -25895,6 +41787,7 @@ static int sp_384_point_to_ecc_point_12(const sp_point_384* p, ecc_point* pm) return err; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -25903,143 +41796,361 @@ static int sp_384_point_to_ecc_point_12(const sp_point_384* p, ecc_point* pm) * b A single precision number to subtract. * m Mask value to apply. 
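
The conditional subtract variants that follow avoid a data-dependent branch by masking the operand instead: m is either 0 or all ones, so "b[i] & m" subtracts b or zero. A minimal sketch of the idea, under illustrative names:

    #include <stdint.h>

    typedef uint32_t sp_digit;

    /* Masked conditional subtract; the borrow comes back as 0 or all ones,
     * ready to be used as another mask. */
    static sp_digit sp_384_cond_sub_12_sketch(sp_digit* r, const sp_digit* a,
        const sp_digit* b, sp_digit m)
    {
        sp_digit borrow = 0;
        int i;

        for (i = 0; i < 12; i++) {
            uint64_t t = (uint64_t)a[i] - (b[i] & m) - borrow;
            r[i] = (sp_digit)t;
            borrow = (sp_digit)((t >> 32) & 1);
        }
        return (sp_digit)0 - borrow;
    }

Callers build m as 0 minus a 0/1 condition; the Montgomery helpers later in this file pass masks such as (sp_digit)0 - o so the modulus is only subtracted when an addition actually overflowed it.
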
*/ -SP_NOINLINE static sp_digit sp_384_cond_sub_12(sp_digit* r, const sp_digit* a, - const sp_digit* b, sp_digit m) +static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #48\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r5, [%[a], r8]\n\t" - "sbcs r5, r5, r6\n\t" - "sbcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_384_cond_sub_12_words_%=:\n\t" + "SUBS r4, r8, r4\n\t" + "LDR r6, [%[a], r5]\n\t" + "LDR r7, [%[b], r5]\n\t" + "AND r7, r7, %[m]\n\t" + "SBCS r6, r6, r7\n\t" + "SBC r4, r8, r8\n\t" + "STR r6, [%[r], r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_384_cond_sub_12_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_384_cond_sub_12_words_%=\n\t" +#endif + "MOV %[r], r4\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
+ */ +static sp_digit sp_384_cond_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r5, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SUBS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "SBC %[r], r5, r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ #define sp_384_mont_reduce_order_12 sp_384_mont_reduce_12 +#ifdef WOLFSSL_SP_NO_UMAAL /* Reduce the number back to 384 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. 
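
Both sp_384_mont_reduce_12 variants below implement word-serial Montgomery reduction of a 24-word value by the 12-word modulus, with mp the negative inverse of m[0] mod 2^32. A portable sketch of the algorithm follows; the loop form, the "over" carry word and the simplified pointer handling are illustrative, not the committed code, which advances the a pointer in registers instead.

    #include <stdint.h>

    typedef uint32_t sp_digit;

    /* Each pass chooses mu so the current low word becomes zero after adding
     * mu*m; after twelve passes the low half is zero and the (possibly
     * once-too-large) result sits in the high half. */
    static void sp_384_mont_reduce_12_sketch(sp_digit* a, const sp_digit* m,
        sp_digit mp)
    {
        sp_digit over = 0;              /* carry out of the top word so far */
        int i, j;

        for (i = 0; i < 12; i++) {
            sp_digit mu = a[i] * mp;    /* a[i] + mu*m[0] == 0 mod 2^32 */
            uint64_t carry = 0;
            uint64_t t;

            for (j = 0; j < 12; j++) {
                t = (uint64_t)a[i + j] + (uint64_t)mu * m[j] + carry;
                a[i + j] = (sp_digit)t;
                carry = t >> 32;
            }
            t = (uint64_t)a[i + 12] + carry + over;
            a[i + 12] = (sp_digit)t;
            over = (sp_digit)(t >> 32);
        }
        /* then: sp_384_cond_sub_12(a, a + 12, m, (sp_digit)0 - over);
         * i.e. copy the high half down, subtracting m once if it overflowed,
         * which is what the call after the assembly loop does. */
    }
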
*/ -SP_NOINLINE static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - sp_digit ca = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( - "mov r9, %[mp]\n\t" - "mov r12, %[m]\n\t" - "mov r10, %[a]\n\t" - "mov r4, #0\n\t" - "add r11, r10, #48\n\t" - "\n1:\n\t" + "LDR lr, [%[m]]\n\t" + /* i = 0 */ + "MOV r11, #0x0\n\t" + "MOV r3, #0x0\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[a], #4]\n\t" + "\n" + "L_sp_384_mont_reduce_12_word_%=:\n\t" /* mu = a[i] * mp */ - "mov %[mp], r9\n\t" - "ldr %[a], [r10]\n\t" - "mul %[mp], %[mp], %[a]\n\t" - "mov %[m], r12\n\t" - "add r14, r10, #40\n\t" - "\n2:\n\t" - /* a[i+j] += m[j] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+j+1] += m[j+1] * mu */ - "ldr %[a], [r10]\n\t" - "mov r4, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r4, r4, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r5, r5, %[a]\n\t" - "adc r4, r4, #0\n\t" - "str r5, [r10], #4\n\t" - "cmp r10, r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 2b\n\t" -#else - "blt.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ + "MUL r10, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "MOV r7, #0x0\n\t" + "UMLAL r4, r7, r10, lr\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r9, [%[m], #4]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r5, r6, r10, r9\n\t" + "MOV r4, r5\n\t" + "ADDS r4, r4, r7\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r9, [%[m], #8]\n\t" + "LDR r5, [%[a], #8]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r5, r7, r10, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r9, [%[m], #12]\n\t" + "LDR r12, [%[a], #12]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #12]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r9, [%[m], #16]\n\t" + "LDR r12, [%[a], #16]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #16]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r9, [%[m], #20]\n\t" + "LDR r12, [%[a], #20]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #20]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r9, [%[m], #24]\n\t" + "LDR r12, [%[a], #24]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #24]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r9, [%[m], #28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #28]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r9, [%[m], #32]\n\t" + "LDR r12, [%[a], #32]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #32]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r9, [%[m], #36]\n\t" + "LDR r12, [%[a], 
#36]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #36]\n\t" + "ADC r6, r6, #0x0\n\t" /* a[i+10] += m[10] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" + "LDR r9, [%[m], #40]\n\t" + "LDR r12, [%[a], #40]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #40]\n\t" + "ADC r7, r7, #0x0\n\t" /* a[i+11] += m[11] * mu */ - "mov r4, %[ca]\n\t" - "mov %[ca], #0\n\t" - /* Multiply m[11] and mu - Start */ - "ldr r8, [%[m]]\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc %[ca], %[ca], #0\n\t" - /* Multiply m[11] and mu - Done */ - "ldr r6, [r10]\n\t" - "ldr r8, [r10, #4]\n\t" - "adds r6, r6, r5\n\t" - "adcs r8, r8, r4\n\t" - "adc %[ca], %[ca], #0\n\t" - "str r6, [r10]\n\t" - "str r8, [r10, #4]\n\t" - /* Next word in a */ - "sub r10, r10, #40\n\t" - "cmp r10, r11\n\t" + "LDR r9, [%[m], #44]\n\t" + "LDR r12, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r9\n\t" + "ADDS r7, r7, r8\n\t" + "ADCS r6, r9, r3\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, r3\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #44]\n\t" + "LDR r12, [%[a], #48]\n\t" + "ADCS r12, r12, r6\n\t" + "STR r12, [%[a], #48]\n\t" + "ADC r3, r3, #0x0\n\t" + /* i += 1 */ + "ADD r11, r11, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r11, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_384_mont_reduce_12_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "mov %[m], r12\n\t" - : [ca] "+r" (ca), [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" + "BLT.N L_sp_384_mont_reduce_12_word_%=\n\t" +#endif + /* Loop Done */ + "STR r4, [%[a]]\n\t" + "STR r5, [%[a], #4]\n\t" + "MOV %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - - sp_384_cond_sub_12(a - 12, a, m, (sp_digit)0 - ca); + sp_384_cond_sub_12(a - 12, a, m, (sp_digit)0 - mp); } +#else +/* Reduce the number back to 384 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. 
+ */ +static void sp_384_mont_reduce_12(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[a], #12]\n\t" + "LDR r10, [%[a], #16]\n\t" + "\n" + "L_sp_384_mont_reduce_12_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL lr, %[mp], r6\n\t" + /* a[i+0] += m[0] * mu */ + "LDR r12, [%[m]]\n\t" + "MOV r3, #0x0\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r12, [%[m], #4]\n\t" + "MOV r6, r7\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r12, [%[m], #8]\n\t" + "MOV r7, r8\n\t" + "UMAAL r7, r3, lr, r12\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r12, [%[m], #12]\n\t" + "MOV r8, r9\n\t" + "UMAAL r8, r3, lr, r12\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r12, [%[m], #16]\n\t" + "MOV r9, r10\n\t" + "UMAAL r9, r3, lr, r12\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r12, [%[m], #20]\n\t" + "LDR r10, [%[a], #20]\n\t" + "UMAAL r10, r3, lr, r12\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r12, [%[m], #24]\n\t" + "LDR r11, [%[a], #24]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r12, [%[m], #28]\n\t" + "LDR r11, [%[a], #28]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r12, [%[m], #32]\n\t" + "LDR r11, [%[a], #32]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r12, [%[m], #36]\n\t" + "LDR r11, [%[a], #36]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r12, [%[m], #40]\n\t" + "LDR r11, [%[a], #40]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r12, [%[m], #44]\n\t" + "LDR r11, [%[a], #44]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "LDR lr, [%[a], #48]\n\t" + "MOV r12, #0x0\n\t" + "UMAAL r3, lr, r12, r12\n\t" + "STR r11, [%[a], #44]\n\t" + "ADDS r3, r3, r5\n\t" + "ADC r5, lr, #0x0\n\t" + "STR r3, [%[a], #48]\n\t" + /* i += 1 */ + "ADD r4, r4, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r4, #0x30\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_384_mont_reduce_12_word_%=\n\t" +#else + "BLT.N L_sp_384_mont_reduce_12_word_%=\n\t" +#endif + /* Loop Done */ + "STR r6, [%[a]]\n\t" + "STR r7, [%[a], #4]\n\t" + "STR r8, [%[a], #8]\n\t" + "STR r9, [%[a], #12]\n\t" + "STR r10, [%[a], #16]\n\t" + "MOV %[mp], r5\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + sp_384_cond_sub_12(a - 12, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -26189,44 +42300,175 @@ static void sp_384_mont_inv_12(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
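
The rewritten sp_384_cmp_12 below compares from the most significant word down, letting only the first differing word set the result; the "r3" mask drops to zero after that so later words are masked out, and conditional IT/mov forms stand in for branches. The following C sketch shows the shape of that logic only; the names are illustrative and a C compiler gives no constant-time guarantee, unlike the assembly.

    #include <stdint.h>

    typedef uint32_t sp_digit;
    typedef int32_t  sp_int32;

    /* Functional sketch of the constant-flow compare: returns 1, 0 or -1. */
    static sp_int32 sp_384_cmp_12_sketch(const sp_digit* a, const sp_digit* b)
    {
        sp_int32 res = 0;
        sp_digit live = (sp_digit)-1;  /* all ones until a difference is seen */
        int i;

        for (i = 11; i >= 0; i--) {
            sp_digit x = a[i] & live;
            sp_digit y = b[i] & live;

            res |= (sp_int32)((x > y) - (x < y)) & (sp_int32)live;
            live &= (sp_digit)0 - (sp_digit)(x == y);
        }
        return res;
    }
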
*/ -SP_NOINLINE static sp_int32 sp_384_cmp_12(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_384_cmp_12(const sp_digit* a_p, const sp_digit* b_p) { - sp_digit r = 0; - + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mvn r3, r3\n\t" - "mov r6, #44\n\t" - "\n1:\n\t" - "ldr r8, [%[a], r6]\n\t" - "ldr r5, [%[b], r6]\n\t" - "and r8, r8, r3\n\t" - "and r5, r5, r3\n\t" - "mov r4, r8\n\t" - "subs r8, r8, r5\n\t" - "sbc r8, r8, r8\n\t" - "add %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "subs r5, r5, r4\n\t" - "sbc r8, r8, r8\n\t" - "sub %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "sub r6, r6, #4\n\t" - "cmp r6, #0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 1b\n\t" + "MOV r2, #0x-1\n\t" + "MOV r8, #0x1\n\t" + "MOV r7, #0x0\n\t" + "MOV r3, #0x-1\n\t" +#ifdef WOLFSSL_SP_SMALL + "MOV r6, #0x2c\n\t" + "\n" + "L_sp_384_cmp_12_words_%=:\n\t" + "LDR r4, [%[a], r6]\n\t" + "LDR r5, [%[b], r6]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "SUBS r6, r6, #0x4\n\t" + "bcs L_sp_384_cmp_12_words_%=\n\t" + "EOR r2, r2, r3\n\t" #else - "bge.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "r3", "r4", "r5", "r6", "r8" + "LDR r4, [%[a], #44]\n\t" + "LDR r5, [%[b], #44]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #40]\n\t" + "LDR r5, [%[b], #40]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #36]\n\t" + "LDR r5, [%[b], #36]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #32]\n\t" + "LDR r5, [%[b], #32]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #28]\n\t" + "LDR r5, [%[b], #28]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #24]\n\t" + "LDR r5, [%[b], #24]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #20]\n\t" + "LDR r5, [%[b], #20]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #16]\n\t" + "LDR r5, [%[b], #16]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #12]\n\t" + "LDR r5, [%[b], #12]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT 
lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #8]\n\t" + "LDR r5, [%[b], #8]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #4]\n\t" + "LDR r5, [%[b], #4]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[b]]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "EOR r2, r2, r3\n\t" +#endif /*WOLFSSL_SP_SMALL */ + "MOV %[a], r2\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); - - return r; + return (uint32_t)(size_t)a; } /* Normalize the values in each word to 32. @@ -26282,9 +42524,13 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p, * b Second number to add in Montgomery form. * m Modulus (prime). */ -SP_NOINLINE static void sp_384_mont_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +static void sp_384_mont_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register const sp_digit* m asm ("r3") = (const sp_digit*)m_p; + sp_digit o; o = sp_384_add_12(r, a, b); @@ -26297,8 +42543,12 @@ SP_NOINLINE static void sp_384_mont_add_12(sp_digit* r, const sp_digit* a, const * a Number to double in Montgomery form. * m Modulus (prime). */ -SP_NOINLINE static void sp_384_mont_dbl_12(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_384_mont_dbl_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; + sp_digit o; o = sp_384_add_12(r, a, a); @@ -26311,8 +42561,12 @@ SP_NOINLINE static void sp_384_mont_dbl_12(sp_digit* r, const sp_digit* a, const * a Number to triple in Montgomery form. * m Modulus (prime). */ -SP_NOINLINE static void sp_384_mont_tpl_12(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_384_mont_tpl_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; + sp_digit o; o = sp_384_add_12(r, a, a); @@ -26321,6 +42575,7 @@ SP_NOINLINE static void sp_384_mont_tpl_12(sp_digit* r, const sp_digit* a, const sp_384_cond_sub_12(r, r, m, 0 - o); } +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -26329,39 +42584,110 @@ SP_NOINLINE static void sp_384_mont_tpl_12(sp_digit* r, const sp_digit* a, const * b A single precision number to add. * m Mask value to apply. 
*/ -SP_NOINLINE static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) +static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #48\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adds r5, %[c], #-1\n\t" - "ldr r5, [%[a], r8]\n\t" - "adcs r5, r5, r6\n\t" - "mov %[c], #0\n\t" - "adcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r5, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "\n" + "L_sp_384_cond_add_12_words_%=:\n\t" + "ADDS r5, r5, #0x-1\n\t" + "LDR r6, [%[a], r4]\n\t" + "LDR r7, [%[b], r4]\n\t" + "AND r7, r7, %[m]\n\t" + "ADCS r6, r6, r7\n\t" + "ADC r5, r8, r8\n\t" + "STR r6, [%[r], r4]\n\t" + "ADD r4, r4, #0x4\n\t" + "CMP r4, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_384_cond_add_12_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_384_cond_add_12_words_%=\n\t" +#endif + "MOV %[r], r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
+ */ +static sp_digit sp_384_cond_add_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADDS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "ADC %[r], r10, r10\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Subtract two Montgomery form numbers (r = a - b % m). * * r Result of subtration. @@ -26369,68 +42695,76 @@ SP_NOINLINE static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a, c * b Number to subtract with in Montgomery form. * m Modulus (prime). 
*/ -SP_NOINLINE static void sp_384_mont_sub_12(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +static void sp_384_mont_sub_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register const sp_digit* m asm ("r3") = (const sp_digit*)m_p; + sp_digit o; o = sp_384_sub_12(r, a, b); sp_384_cond_add_12(r, r, m, o); } -#define sp_384_mont_sub_lower_12 sp_384_mont_sub_12 -static void sp_384_rshift1_12(sp_digit* r, const sp_digit* a) +#ifdef WOLFSSL_SP_SMALL +#else +#endif /* WOLFSSL_SP_SMALL */ +static void sp_384_rshift1_12(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" - "lsr r2, r2, #1\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r2, [%[r], #0]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r3, [%[r], #4]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r4, [%[r], #8]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r2, [%[r], #12]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r3, [%[r], #16]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r4, [%[r], #20]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r2, [%[r], #24]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r3, [%[r], #28]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r4, [%[r], #32]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r2, [%[r], #36]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "str r3, [%[r], #40]\n\t" - "str r4, [%[r], #44]\n\t" + "LDM %[a], {r2, r3}\n\t" + "LSR r2, r2, #1\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #8]\n\t" + "STR r2, [%[r]]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #12]\n\t" + "STR r3, [%[r], #4]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #16]\n\t" + "STR r4, [%[r], #8]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #20]\n\t" + "STR r2, [%[r], #12]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #24]\n\t" + "STR r3, [%[r], #16]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #28]\n\t" + "STR r4, [%[r], #20]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #32]\n\t" + "STR r2, [%[r], #24]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #36]\n\t" + "STR r3, [%[r], #28]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #40]\n\t" + "STR r4, [%[r], #32]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #44]\n\t" + "STR r2, [%[r], #36]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "STR r3, [%[r], #40]\n\t" + "STR r4, [%[r], #44]\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) 
: "memory", "r2", "r3", "r4" ); } @@ -26441,7 +42775,7 @@ static void sp_384_rshift1_12(sp_digit* r, const sp_digit* a) * a Number to divide. * m Modulus (prime). */ -SP_NOINLINE static void sp_384_div2_12(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_384_div2_12(sp_digit* r, const sp_digit* a, const sp_digit* m) { sp_digit o; @@ -26504,7 +42838,7 @@ static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p, /* X = X - Y */ sp_384_mont_sub_12(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_lower_12(y, y, x, p384_mod); + sp_384_mont_sub_12(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -26626,7 +42960,7 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co break; case 16: /* Y = Y - X */ - sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -26689,12 +43023,12 @@ static int sp_384_iszero_12(const sp_digit* a) static void sp_384_proj_point_add_12(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*12; - sp_digit* t3 = t + 4*12; - sp_digit* t4 = t + 6*12; - sp_digit* t5 = t + 8*12; - sp_digit* t6 = t + 10*12; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*12; + sp_digit* t2 = t + 4*12; + sp_digit* t3 = t + 6*12; + sp_digit* t4 = t + 8*12; + sp_digit* t5 = t + 10*12; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod); @@ -26716,17 +43050,9 @@ static void sp_384_proj_point_add_12(sp_point_384* r, sp_384_proj_point_dbl_12(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_384_mont_sub_12(t2, t2, t1, p384_mod); @@ -26745,20 +43071,31 @@ static void sp_384_proj_point_add_12(sp_point_384* r, sp_384_mont_dbl_12(t3, y, p384_mod); sp_384_mont_sub_12(x, x, t3, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_12(y, y, x, p384_mod); + sp_384_mont_sub_12(y, y, x, p384_mod); sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_12(y, y, t5, p384_mod); - for (i = 0; i < 12; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 12; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 12; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -26804,12 +43141,12 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*12; - ctx->t3 = t + 4*12; - ctx->t4 = t + 6*12; - ctx->t5 = t 
+ 8*12; - ctx->t6 = t + 10*12; + ctx->t6 = t; + ctx->t1 = t + 2*12; + ctx->t2 = t + 4*12; + ctx->t3 = t + 6*12; + ctx->t4 = t + 8*12; + ctx->t5 = t + 10*12; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -26916,7 +43253,7 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_12(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 22; break; case 22: @@ -26929,22 +43266,28 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 12; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 12; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 12; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -27240,8 +43583,6 @@ static int sp_384_ecc_mulmod_fast_12(sp_point_384* r, const sp_point_384* g, con } #ifdef FP_ECC -#define sp_384_mont_dbl_lower_12 sp_384_mont_dbl_12 -#define sp_384_mont_tpl_lower_12 sp_384_mont_tpl_12 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
@@ -27280,7 +43621,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_12(a, t1, p384_mod); + sp_384_mont_tpl_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -27289,8 +43630,8 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_12(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_12(b, t2, p384_mod); + sp_384_mont_sub_12(t2, b, x, p384_mod); + sp_384_mont_dbl_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -27310,7 +43651,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_12(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_12(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_12(a, t1, p384_mod); + sp_384_mont_tpl_12(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_12(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_12(b, t1, x, p384_mod, p384_mp_mod); @@ -27319,8 +43660,8 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i, sp_384_mont_dbl_12(t2, b, p384_mod); sp_384_mont_sub_12(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_12(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_12(b, t2, p384_mod); + sp_384_mont_sub_12(t2, b, x, p384_mod); + sp_384_mont_dbl_12(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_12(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -27376,12 +43717,12 @@ typedef struct sp_table_entry_384 { static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*12; - sp_digit* t3 = t + 4*12; - sp_digit* t4 = t + 6*12; - sp_digit* t5 = t + 8*12; - sp_digit* t6 = t + 10*12; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*12; + sp_digit* t6 = t + 4*12; + sp_digit* t1 = t + 6*12; + sp_digit* t4 = t + 8*12; + sp_digit* t5 = t + 10*12; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -27397,13 +43738,9 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, sp_384_proj_point_dbl_12(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_384_mont_sub_12(t2, t2, p->x, p384_mod); @@ -27412,33 +43749,40 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, /* Z3 = H*Z1 */ sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_384_mont_sqr_12(t1, t4, p384_mod, p384_mp_mod); - sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t3, p->x, t5, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_12(x, t1, t5, p384_mod); - sp_384_mont_dbl_12(t1, t3, p384_mod); - sp_384_mont_sub_12(x, x, t1, p384_mod); + sp_384_mont_sqr_12(t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t3, p->x, t1, p384_mod, p384_mp_mod); + sp_384_mont_mul_12(t1, t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_12(t2, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_12(t2, t2, t1, p384_mod); + sp_384_mont_dbl_12(t5, t3, p384_mod); + sp_384_mont_sub_12(x, t2, t5, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_lower_12(t3, t3, x, p384_mod); + sp_384_mont_sub_12(t3, t3, x, p384_mod); sp_384_mont_mul_12(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_12(t5, t5, p->y, p384_mod, p384_mp_mod); - sp_384_mont_sub_12(y, t3, t5, p384_mod); + sp_384_mont_mul_12(t1, t1, p->y, p384_mod, p384_mp_mod); + sp_384_mont_sub_12(y, t3, t1, p384_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 12; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 12; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 12; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 12; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 12; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -28386,7 +44730,7 @@ int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_384* point = NULL; + sp_point_384* point = NULL; sp_digit* k = NULL; #else sp_point_384 point[2]; @@ -29946,7 +46290,7 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -30005,50 +46349,32 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, * * a A single precision integer. 
*/ -SP_NOINLINE static void sp_384_add_one_12(sp_digit* a) +static void sp_384_add_one_12(sp_digit* a_p) { + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r2, #1\n\t" - "ldr r1, [%[a], #0]\n\t" - "adds r1, r1, r2\n\t" - "mov r2, #0\n\t" - "str r1, [%[a], #0]\n\t" - "ldr r1, [%[a], #4]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #4]\n\t" - "ldr r1, [%[a], #8]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #8]\n\t" - "ldr r1, [%[a], #12]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #12]\n\t" - "ldr r1, [%[a], #16]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #16]\n\t" - "ldr r1, [%[a], #20]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #20]\n\t" - "ldr r1, [%[a], #24]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #24]\n\t" - "ldr r1, [%[a], #28]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #28]\n\t" - "ldr r1, [%[a], #32]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #32]\n\t" - "ldr r1, [%[a], #36]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #36]\n\t" - "ldr r1, [%[a], #40]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #40]\n\t" - "ldr r1, [%[a], #44]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #44]\n\t" + "LDM %[a], {r1, r2, r3, r4}\n\t" + "ADDS r1, r1, #0x1\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4}\n\t" + "LDM %[a], {r1, r2, r3, r4}\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4}\n\t" + "LDM %[a], {r1, r2, r3, r4}\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4}\n\t" + : [a] "+r" (a) : - : [a] "r" (a) - : "memory", "r1", "r2" + : "memory", "r1", "r2", "r3", "r4" ); } @@ -30143,7 +46469,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_384* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -30151,7 +46477,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -30427,136 +46753,210 @@ int sp_ecc_secret_gen_384_nb(sp_ecc_ctx_t* sp_ctx, const mp_int* priv, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_384_sub_in_place_12(sp_digit* a, - const sp_digit* b) +static sp_digit sp_384_sub_in_place_12(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r8, %[a]\n\t" - "add r8, r8, #48\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #8\n\t" - "add %[b], %[b], #8\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" -#else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r8" - ); + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; - return c; + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "ADD r11, %[a], #0x30\n\t" + "\n" + "L_sp_384_sub_in_pkace_12_word_%=:\n\t" + "RSBS r10, r10, #0x0\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC r10, r10, r10\n\t" + "CMP %[a], r11\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BNE L_sp_384_sub_in_pkace_12_word_%=\n\t" +#else + "BNE.N L_sp_384_sub_in_pkace_12_word_%=\n\t" +#endif + "MOV %[a], r10\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)a; } #else -/* Sub b from a into r. (r = a - b) +/* Sub b from a into a. (a -= b) * - * r A single precision integer. - * a A single precision integer. + * a A single precision integer and result. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_384_sub_in_place_12(sp_digit* a, - const sp_digit* b) +static sp_digit sp_384_sub_in_place_12(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); - - return c; + return (uint32_t)(size_t)a; } #endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL /* Mul a by digit b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision digit. 
*/ -SP_NOINLINE static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, - sp_digit b) +static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + __asm__ __volatile__ ( - "add r9, %[a], #48\n\t" /* A[0] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r5, r3, r6, %[b]\n\t" - "mov r4, #0\n\t" - "str r5, [%[r]], #4\n\t" - /* A[0] * B - Done */ - "\n1:\n\t" - "mov r5, #0\n\t" - /* A[] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r6, r8, r6, %[b]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[] * B - Done */ - "str r3, [%[r]], #4\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "cmp %[a], r9\n\t" + "LDR r8, [%[a]]\n\t" + "UMULL r5, r3, %[b], r8\n\t" + "MOV r4, #0x0\n\t" + "STR r5, [%[r]]\n\t" + "MOV r5, #0x0\n\t" + "MOV r9, #0x4\n\t" + "\n" + "L_sp_384_mul_d_12_word_%=:\n\t" + /* A[i] * B */ + "LDR r8, [%[a], r9]\n\t" + "UMULL r6, r7, %[b], r8\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], r9]\n\t" + "MOV r3, r4\n\t" + "MOV r4, r5\n\t" + "MOV r5, #0x0\n\t" + "ADD r9, r9, #0x4\n\t" + "CMP r9, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_384_mul_d_12_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r]]\n\t" - : [r] "+r" (r), [a] "+r" (a) - : [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9" + "BLT.N L_sp_384_mul_d_12_word_%=\n\t" +#endif + "STR r3, [%[r], #48]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } +#else +/* Mul a by digit b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision digit. 
+ */ +static void sp_384_mul_d_12(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + + __asm__ __volatile__ ( + /* A[0] * B */ + "LDM %[a]!, {r8}\n\t" + "UMULL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[1] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[2] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[3] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[4] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[5] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[6] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[7] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[8] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[9] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[10] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[11] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "STR r3, [%[r]]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_USE_UDIV /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -30566,49 +46966,122 @@ SP_NOINLINE static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, * * Note that this is an approximate div. It may give an answer 1 larger. 
*/ -SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, - sp_digit div) +static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - sp_digit r = 0; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( - "lsr r6, %[div], #16\n\t" - "add r6, r6, #1\n\t" - "udiv r4, %[d1], r6\n\t" - "lsl r8, r4, #16\n\t" - "umull r4, r5, %[div], r8\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r5, %[d1], r6\n\t" - "lsl r4, r5, #16\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r4, %[d0], %[div]\n\t" - "add r8, r8, r4\n\t" - "mov %[r], r8\n\t" - : [r] "+r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "r4", "r5", "r6", "r8" + "LSR r8, %[div], #16\n\t" + "ADD r5, r8, #0x1\n\t" + "UDIV r6, %[d1], r5\n\t" + "LSL r7, %[div], #16\n\t" + "LSL r6, r6, #16\n\t" + "UMULL r3, r4, %[div], r6\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "SUBS r3, %[d1], r5\n\t" + "SBC r9, r9, r9\n\t" + "ADD r9, r9, #0x1\n\t" + "RSB r10, r9, #0x0\n\t" + "LSL r9, r9, #16\n\t" + "AND r7, r7, r10\n\t" + "AND r8, r8, r10\n\t" + "SUBS %[d0], %[d0], r7\n\t" + "ADD r6, r6, r9\n\t" + "SBC %[d1], %[d1], r8\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "UMULL r3, r4, %[div], r3\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "MUL r3, %[div], r3\n\t" + "SUB %[d0], %[d0], r3\n\t" + "UDIV r3, %[d0], %[div]\n\t" + "ADD %[d1], r6, r3\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - return r; + return (uint32_t)(size_t)d1; } +#else +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. + * div The divisor. + * returns the result of the division. + * + * Note that this is an approximate div. It may give an answer 1 larger. 
+ */ +static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +{ + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; + + __asm__ __volatile__ ( + "LSR r5, %[div], #1\n\t" + "ADD r5, r5, #0x1\n\t" + "MOV r6, %[d0]\n\t" + "MOV r7, %[d1]\n\t" + /* Do top 32 */ + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "MOV r3, #0x0\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + /* Next 30 bits */ + "MOV r4, #0x1d\n\t" + "\n" + "L_div_384_word_12_bit_%=:\n\t" + "LSLS r6, r6, #1\n\t" + "ADC r7, r7, r7\n\t" + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "ADD r3, r3, r3\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + "SUBS r4, r4, #0x1\n\t" + "bpl L_div_384_word_12_bit_%=\n\t" + "ADD r3, r3, r3\n\t" + "ADD r3, r3, #0x1\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "SUBS r8, %[div], r9\n\t" + "SBC r8, r8, r8\n\t" + "SUB %[d1], r3, r8\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)d1; +} + +#endif /* AND m into each word of a and store in r. * * r A single precision integer. @@ -31231,245 +47704,268 @@ int sp_ecc_sign_384_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W * a Number to divide. * m Modulus. */ -static void sp_384_div2_mod_12(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_384_div2_mod_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; + __asm__ __volatile__ ( - "ldr r4, [%[a]]\n\t" - "ands r8, r4, #1\n\t" - "beq 1f\n\t" - "mov r12, #0\n\t" - "ldr r5, [%[a], #4]\n\t" - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #12]\n\t" - "ldr r8, [%[m], #0]\n\t" - "ldr r9, [%[m], #4]\n\t" - "ldr r10, [%[m], #8]\n\t" - "ldr r14, [%[m], #12]\n\t" - "adds r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #0]\n\t" - "str r5, [%[r], #4]\n\t" - "str r6, [%[r], #8]\n\t" - "str r7, [%[r], #12]\n\t" - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[a], #20]\n\t" - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[a], #28]\n\t" - "ldr r8, [%[m], #16]\n\t" - "ldr r9, [%[m], #20]\n\t" - "ldr r10, [%[m], #24]\n\t" - "ldr r14, [%[m], #28]\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #16]\n\t" - "str r5, [%[r], #20]\n\t" - "str r6, [%[r], #24]\n\t" - "str r7, [%[r], #28]\n\t" - "ldr r4, [%[a], #32]\n\t" - "ldr r5, [%[a], #36]\n\t" - "ldr r6, [%[a], #40]\n\t" - "ldr r7, [%[a], #44]\n\t" - "ldr r8, [%[m], #32]\n\t" - "ldr r9, [%[m], #36]\n\t" - "ldr r10, [%[m], #40]\n\t" - "ldr r14, [%[m], #44]\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #32]\n\t" - "str r5, [%[r], #36]\n\t" - "str r6, [%[r], #40]\n\t" - "str r7, [%[r], #44]\n\t" - "adc r8, r12, r12\n\t" - "b 2f\n\t" - "\n1:\n\t" - "ldr r5, [%[a], #2]\n\t" - "str r4, [%[r], #0]\n\t" - "str 
r5, [%[r], #2]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[a], #6]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #6]\n\t" - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[a], #10]\n\t" - "str r4, [%[r], #8]\n\t" - "str r5, [%[r], #10]\n\t" - "ldr r4, [%[a], #12]\n\t" - "ldr r5, [%[a], #14]\n\t" - "str r4, [%[r], #12]\n\t" - "str r5, [%[r], #14]\n\t" - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[a], #18]\n\t" - "str r4, [%[r], #16]\n\t" - "str r5, [%[r], #18]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[a], #22]\n\t" - "str r4, [%[r], #20]\n\t" - "str r5, [%[r], #22]\n\t" - "\n2:\n\t" - "ldr r3, [%[r]]\n\t" - "ldr r4, [%[r], #4]\n\t" - "lsr r3, r3, #1\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r5, [%[a], #8]\n\t" - "str r3, [%[r], #0]\n\t" - "orr r4, r4, r5, lsl #31\n\t" - "lsr r5, r5, #1\n\t" - "ldr r3, [%[a], #12]\n\t" - "str r4, [%[r], #4]\n\t" - "orr r5, r5, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #16]\n\t" - "str r5, [%[r], #8]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r5, [%[a], #20]\n\t" - "str r3, [%[r], #12]\n\t" - "orr r4, r4, r5, lsl #31\n\t" - "lsr r5, r5, #1\n\t" - "ldr r3, [%[a], #24]\n\t" - "str r4, [%[r], #16]\n\t" - "orr r5, r5, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #28]\n\t" - "str r5, [%[r], #20]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r5, [%[a], #32]\n\t" - "str r3, [%[r], #24]\n\t" - "orr r4, r4, r5, lsl #31\n\t" - "lsr r5, r5, #1\n\t" - "ldr r3, [%[a], #36]\n\t" - "str r4, [%[r], #28]\n\t" - "orr r5, r5, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #40]\n\t" - "str r5, [%[r], #32]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r5, [%[a], #44]\n\t" - "str r3, [%[r], #36]\n\t" - "orr r4, r4, r5, lsl #31\n\t" - "lsr r5, r5, #1\n\t" - "orr r5, r5, r8, lsl #31\n\t" - "str r4, [%[r], #40]\n\t" - "str r5, [%[r], #44]\n\t" + "LDM %[a]!, {r4}\n\t" + "ANDS r3, r4, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_div2_mod_12_even_%=\n\t" +#else + "BEQ.N L_sp_384_div2_mod_12_even_%=\n\t" +#endif + "MOV r12, #0x0\n\t" + "LDM %[a]!, {r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "ADC r3, r12, r12\n\t" + "B L_sp_384_div2_mod_12_div2_%=\n\t" + "\n" + "L_sp_384_div2_mod_12_even_%=:\n\t" + "LDM %[a]!, {r5, r6, r7}\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "\n" + "L_sp_384_div2_mod_12_div2_%=:\n\t" + "SUB %[r], %[r], #0x30\n\t" + "LDRD r8, r9, [%[r]]\n\t" + "LSR r8, r8, #1\n\t" + "ORR r8, r8, r9, lsl #31\n\t" + "LSR r9, r9, #1\n\t" + "LDR r10, [%[r], #8]\n\t" + "STR r8, [%[r]]\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "LSR r10, r10, #1\n\t" + "LDR r8, [%[r], #12]\n\t" + "STR r9, [%[r], #4]\n\t" + "ORR r10, r10, r8, lsl #31\n\t" + "LSR r8, r8, #1\n\t" + "LDR r9, [%[r], #16]\n\t" + "STR r10, [%[r], 
#8]\n\t" + "ORR r8, r8, r9, lsl #31\n\t" + "LSR r9, r9, #1\n\t" + "LDR r10, [%[r], #20]\n\t" + "STR r8, [%[r], #12]\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "LSR r10, r10, #1\n\t" + "LDR r8, [%[r], #24]\n\t" + "STR r9, [%[r], #16]\n\t" + "ORR r10, r10, r8, lsl #31\n\t" + "LSR r8, r8, #1\n\t" + "LDR r9, [%[r], #28]\n\t" + "STR r10, [%[r], #20]\n\t" + "ORR r8, r8, r9, lsl #31\n\t" + "LSR r9, r9, #1\n\t" + "LDR r10, [%[r], #32]\n\t" + "STR r8, [%[r], #24]\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "LSR r10, r10, #1\n\t" + "LDR r8, [%[r], #36]\n\t" + "STR r9, [%[r], #28]\n\t" + "ORR r10, r10, r8, lsl #31\n\t" + "LSR r8, r8, #1\n\t" + "LDR r9, [%[r], #40]\n\t" + "STR r10, [%[r], #32]\n\t" + "ORR r8, r8, r9, lsl #31\n\t" + "LSR r9, r9, #1\n\t" + "LDR r10, [%[r], #44]\n\t" + "STR r8, [%[r], #36]\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "LSR r10, r10, #1\n\t" + "ORR r10, r10, r3, lsl #31\n\t" + "STR r9, [%[r], #40]\n\t" + "STR r10, [%[r], #44]\n\t" + : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m) : - : [r] "r" (r), [a] "r" (a), [m] "r" (m) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); } -static int sp_384_num_bits_12(sp_digit* a) +static int sp_384_num_bits_12(const sp_digit* a_p) { - int r = 0; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "ldr r2, [%[a], #44]\n\t" - "cmp r2, #0\n\t" - "beq 11f\n\t" - "mov r3, #384\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n11:\n\t" - "ldr r2, [%[a], #40]\n\t" - "cmp r2, #0\n\t" - "beq 10f\n\t" - "mov r3, #352\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n10:\n\t" - "ldr r2, [%[a], #36]\n\t" - "cmp r2, #0\n\t" - "beq 9f\n\t" - "mov r3, #320\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n9:\n\t" - "ldr r2, [%[a], #32]\n\t" - "cmp r2, #0\n\t" - "beq 8f\n\t" - "mov r3, #288\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n8:\n\t" - "ldr r2, [%[a], #28]\n\t" - "cmp r2, #0\n\t" - "beq 7f\n\t" - "mov r3, #256\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n7:\n\t" - "ldr r2, [%[a], #24]\n\t" - "cmp r2, #0\n\t" - "beq 6f\n\t" - "mov r3, #224\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n6:\n\t" - "ldr r2, [%[a], #20]\n\t" - "cmp r2, #0\n\t" - "beq 5f\n\t" - "mov r3, #192\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n5:\n\t" - "ldr r2, [%[a], #16]\n\t" - "cmp r2, #0\n\t" - "beq 4f\n\t" - "mov r3, #160\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n4:\n\t" - "ldr r2, [%[a], #12]\n\t" - "cmp r2, #0\n\t" - "beq 3f\n\t" - "mov r3, #128\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n3:\n\t" - "ldr r2, [%[a], #8]\n\t" - "cmp r2, #0\n\t" - "beq 2f\n\t" - "mov r3, #96\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n2:\n\t" - "ldr r2, [%[a], #4]\n\t" - "cmp r2, #0\n\t" - "beq 1f\n\t" - "mov r3, #64\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 13f\n\t" - "\n1:\n\t" - "ldr r2, [%[a], #0]\n\t" - "mov r3, #32\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "\n13:\n\t" - : [r] "+r" (r) - : [a] "r" (a) - : "r2", "r3" + "LDR r1, [%[a], #44]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_11_%=\n\t" +#else + "BEQ.N L_sp_384_num_bits_12_11_%=\n\t" +#endif + "MOV r2, #0x180\n\t" + "CLZ r4, r1\n\t" + "SUB 
r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_11_%=:\n\t" + "LDR r1, [%[a], #40]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_10_%=\n\t" +#else + "BEQ.N L_sp_384_num_bits_12_10_%=\n\t" +#endif + "MOV r2, #0x160\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_10_%=:\n\t" + "LDR r1, [%[a], #36]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_9_%=\n\t" +#else + "BEQ.N L_sp_384_num_bits_12_9_%=\n\t" +#endif + "MOV r2, #0x140\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_9_%=:\n\t" + "LDR r1, [%[a], #32]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_8_%=\n\t" +#else + "BEQ.N L_sp_384_num_bits_12_8_%=\n\t" +#endif + "MOV r2, #0x120\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_8_%=:\n\t" + "LDR r1, [%[a], #28]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_7_%=\n\t" +#else + "BEQ.N L_sp_384_num_bits_12_7_%=\n\t" +#endif + "MOV r2, #0x100\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_7_%=:\n\t" + "LDR r1, [%[a], #24]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_6_%=\n\t" +#else + "BEQ.N L_sp_384_num_bits_12_6_%=\n\t" +#endif + "MOV r2, #0xe0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_6_%=:\n\t" + "LDR r1, [%[a], #20]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_5_%=\n\t" +#else + "BEQ.N L_sp_384_num_bits_12_5_%=\n\t" +#endif + "MOV r2, #0xc0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_5_%=:\n\t" + "LDR r1, [%[a], #16]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_4_%=\n\t" +#else + "BEQ.N L_sp_384_num_bits_12_4_%=\n\t" +#endif + "MOV r2, #0xa0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_4_%=:\n\t" + "LDR r1, [%[a], #12]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_3_%=\n\t" +#else + "BEQ.N L_sp_384_num_bits_12_3_%=\n\t" +#endif + "MOV r2, #0x80\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_3_%=:\n\t" + "LDR r1, [%[a], #8]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_2_%=\n\t" +#else + "BEQ.N L_sp_384_num_bits_12_2_%=\n\t" +#endif + "MOV r2, #0x60\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_2_%=:\n\t" + "LDR r1, [%[a], #4]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_384_num_bits_12_1_%=\n\t" +#else + "BEQ.N 
L_sp_384_num_bits_12_1_%=\n\t" +#endif + "MOV r2, #0x40\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_384_num_bits_12_13_%=\n\t" + "\n" + "L_sp_384_num_bits_12_1_%=:\n\t" + "LDR r1, [%[a]]\n\t" + "MOV r2, #0x20\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "\n" + "L_sp_384_num_bits_12_13_%=:\n\t" + "MOV %[a], r4\n\t" + : [a] "+r" (a) + : + : "memory", "r1", "r2", "r3", "r4", "r5" ); - - return r; + return (uint32_t)(size_t)a; } /* Non-constant time modular inversion. @@ -32601,215 +49097,3506 @@ static const sp_digit p521_b[17] = { }; #endif +#ifdef WOLFSSL_SP_SMALL /* Multiply a and b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static void sp_521_mul_17(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[17 * 2]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r10, %[a]\n\t" - "mov r11, %[b]\n\t" - "mov r6, #68\n\t" - "add r6, r6, r10\n\t" - "mov r14, r6\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r5, #0\n\t" - "mov r6, #64\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov %[b], r9\n\t" - "sub %[b], %[b], %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add %[b], %[b], r11\n\t" - "\n2:\n\t" - /* Multiply Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [%[b]]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply Done */ - "add %[a], %[a], #4\n\t" - "sub %[b], %[b], #4\n\t" - "cmp %[a], r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, r9\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r12\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #128\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r], r8]\n\t" - "mov %[a], r10\n\t" - "mov %[b], r11\n\t" - : - : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; - XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); + __asm__ __volatile__ ( + "SUB sp, sp, #0x88\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "\n" + "L_sp_521_mul_17_outer_%=:\n\t" + "SUBS r3, r5, #0x40\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_521_mul_17_inner_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x44\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_mul_17_inner_done_%=\n\t" +#else + "BEQ.N L_sp_521_mul_17_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_521_mul_17_inner_%=\n\t" +#else + "BLE.N L_sp_521_mul_17_inner_%=\n\t" +#endif + "\n" + "L_sp_521_mul_17_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x80\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_521_mul_17_outer_%=\n\t" +#else + "BLE.N L_sp_521_mul_17_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "LDM sp!, {r6, r7}\n\t" + "STM %[r]!, {r6, r7}\n\t" + "SUB r5, r5, #0x8\n\t" + "\n" + "L_sp_521_mul_17_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_521_mul_17_store_%=\n\t" +#else + "BGT.N L_sp_521_mul_17_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" + ); } +#else +/* Multiply a and b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision integer. + */ +static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x44\n\t" + /* A[0] * B[0] */ + "LDR r11, [%[a]]\n\t" + "LDR r12, [%[b]]\n\t" + "UMULL r3, r4, r11, r12\n\t" + "MOV r5, #0x0\n\t" + "STR r3, [sp]\n\t" + /* A[0] * B[1] */ + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[0] */ + "LDR r8, [%[a], #4]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #4]\n\t" + /* A[2] * B[0] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[1] */ + "LDR r11, [%[a], #4]\n\t" + "LDR r12, [%[b], #4]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[0] * B[2] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #8]\n\t" + /* A[0] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[2] */ + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[1] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[0] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #12]\n\t" + /* A[4] * B[0] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, 
#0x0\n\t" + /* A[3] * B[1] */ + "LDR r8, [%[a], #12]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[2] */ + "LDR r11, [%[a], #8]\n\t" + "LDR r12, [%[b], #8]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[3] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[0] * B[4] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #16]\n\t" + /* A[0] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[4] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[2] */ + "LDR r8, [%[a], #12]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[1] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[0] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #20]\n\t" + /* A[6] * B[0] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[1] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[2] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[3] */ + "LDR r11, [%[a], #12]\n\t" + "LDR r12, [%[b], #12]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[4] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[5] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[0] * B[6] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #24]\n\t" + /* A[0] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[6] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[5] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[4] */ + "LDR r9, [%[b], #16]\n\t" + 
"UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[3] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[2] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[1] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[0] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #28]\n\t" + /* A[8] * B[0] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[1] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[2] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[3] */ + "LDR r8, [%[a], #20]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[4] */ + "LDR r11, [%[a], #16]\n\t" + "LDR r12, [%[b], #16]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[5] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[6] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[7] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[0] * B[8] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #32]\n\t" + /* A[0] * B[9] */ + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[8] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[7] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[6] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[4] */ + "LDR r8, [%[a], #20]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[3] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + 
"ADC r5, r5, #0x0\n\t" + /* A[7] * B[2] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[1] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[0] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #36]\n\t" + /* A[10] * B[0] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[1] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[2] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[3] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[4] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[5] */ + "LDR r11, [%[a], #20]\n\t" + "LDR r12, [%[b], #20]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[6] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[7] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[8] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[9] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[0] * B[10] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #40]\n\t" + /* A[0] * B[11] */ + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[10] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[9] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[8] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[7] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[6] */ + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, 
#0x0\n\t" + /* A[6] * B[5] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[4] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[3] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[2] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[1] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[0] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #44]\n\t" + /* A[12] * B[0] */ + "LDR r8, [%[a], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[1] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[2] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[3] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[4] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[5] */ + "LDR r8, [%[a], #28]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[6] */ + "LDR r11, [%[a], #24]\n\t" + "LDR r12, [%[b], #24]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[7] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[8] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[9] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[10] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[11] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[0] * B[12] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #48]\n\t" + /* A[0] * B[13] */ + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + 
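/* Note: every A[i] * B[j] step in this routine follows the same product-scanning
 * pattern: UMULL forms the 64-bit product in r6:r7, ADDS/ADCS fold it into the low
 * two words of a three-word column accumulator (rotating through r3/r4/r5), and the
 * trailing "ADC ..., #0x0" collects the carry into the top word. A rough C model of
 * one output column k (illustrative sketch only, not from the generated source;
 * variable names are made up):
 *
 *     uint64_t sum = lo;                       // low 64 bits carried in
 *     uint32_t top = hi;                       // third accumulator word
 *     for (i = (k <= 16) ? 0 : (k - 16); i <= ((k <= 16) ? k : 16); i++) {
 *         uint64_t p = (uint64_t)a[i] * b[k - i];
 *         sum += p;
 *         top += (sum < p);                    // carry out of the 64-bit add
 *     }
 *     out[k] = (uint32_t)sum;                  // the STR of the finished word
 *     lo = (sum >> 32) | ((uint64_t)top << 32);
 *     hi = 0;                                  // accumulator shifts down one word
 */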
/* A[1] * B[12] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[11] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[10] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[9] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[8] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[6] */ + "LDR r8, [%[a], #28]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[5] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[4] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[3] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[2] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[1] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[0] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #52]\n\t" + /* A[14] * B[0] */ + "LDR r8, [%[a], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * B[1] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[12] * B[2] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[3] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[4] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[5] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[6] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[7] */ + "LDR r11, [%[a], #28]\n\t" + "LDR 
r12, [%[b], #28]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[8] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[9] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[10] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[11] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[12] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[13] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[0] * B[14] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #56]\n\t" + /* A[0] * B[15] */ + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[14] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[13] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[12] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[11] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[10] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[9] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[8] */ + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[7] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[6] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[5] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[4] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[12] * B[3] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #12]\n\t" 
+ "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[13] * B[2] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[14] * B[1] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[15] * B[0] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #60]\n\t" + /* A[16] * B[0] */ + "LDR r8, [%[a], #64]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[15] * B[1] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * B[2] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[3] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[4] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[5] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[6] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[7] */ + "LDR r8, [%[a], #36]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[8] */ + "LDR r11, [%[a], #32]\n\t" + "LDR r12, [%[b], #32]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[9] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[10] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[11] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[12] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[13] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[14] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[15] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[0] * B[16] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], 
#64]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #64]\n\t" + /* A[1] * B[16] */ + "LDR r8, [%[a], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[15] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[14] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[13] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[12] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[11] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[10] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[9] */ + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[8] */ + "LDR r8, [%[a], #36]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[7] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[6] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[12] * B[5] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * B[4] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[14] * B[3] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[15] * B[2] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[16] * B[1] */ + "LDR r8, [%[a], #64]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #68]\n\t" + /* A[16] * B[2] */ + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[15] * B[3] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[14] * B[4] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[13] * B[5] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, 
[%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[12] * B[6] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[7] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[8] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[9] */ + "LDR r11, [%[a], #36]\n\t" + "LDR r12, [%[b], #36]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[10] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[11] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[12] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[13] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[14] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[15] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[16] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #64]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #72]\n\t" + /* A[3] * B[16] */ + "LDR r8, [%[a], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[15] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[14] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[13] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[12] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[11] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[10] */ + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[9] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[8] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, 
r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[7] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[6] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * B[5] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[15] * B[4] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[16] * B[3] */ + "LDR r8, [%[a], #64]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #76]\n\t" + /* A[16] * B[4] */ + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[15] * B[5] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[14] * B[6] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * B[7] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[12] * B[8] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[9] */ + "LDR r8, [%[a], #44]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[10] */ + "LDR r11, [%[a], #40]\n\t" + "LDR r12, [%[b], #40]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[11] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[12] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[13] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[14] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[15] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[16] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #64]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #80]\n\t" + /* A[5] * B[16] */ + "LDR r8, [%[a], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[15] */ + "LDR r8, [%[a], 
#24]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[14] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[13] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[12] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[11] */ + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[10] */ + "LDR r8, [%[a], #44]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[12] * B[9] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[13] * B[8] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[14] * B[7] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[15] * B[6] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[16] * B[5] */ + "LDR r8, [%[a], #64]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #84]\n\t" + /* A[16] * B[6] */ + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[15] * B[7] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * B[8] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[9] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[10] */ + "LDR r8, [%[a], #48]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[11] */ + "LDR r11, [%[a], #44]\n\t" + "LDR r12, [%[b], #44]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[12] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[13] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[14] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[15] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, 
[%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[16] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #64]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #88]\n\t" + /* A[7] * B[16] */ + "LDR r8, [%[a], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[15] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[14] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[13] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[12] */ + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[12] * B[11] */ + "LDR r8, [%[a], #48]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * B[10] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[14] * B[9] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[15] * B[8] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[16] * B[7] */ + "LDR r8, [%[a], #64]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #92]\n\t" + /* A[16] * B[8] */ + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[15] * B[9] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[14] * B[10] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[13] * B[11] */ + "LDR r8, [%[a], #52]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[12] * B[12] */ + "LDR r11, [%[a], #48]\n\t" + "LDR r12, [%[b], #48]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[13] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[14] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[15] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[16] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, 
[%[b], #64]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #96]\n\t" + /* A[9] * B[16] */ + "LDR r8, [%[a], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[15] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[14] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[13] */ + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[12] */ + "LDR r8, [%[a], #52]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * B[11] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[15] * B[10] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[16] * B[9] */ + "LDR r8, [%[a], #64]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #100]\n\t" + /* A[16] * B[10] */ + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[15] * B[11] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[14] * B[12] */ + "LDR r8, [%[a], #56]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * B[13] */ + "LDR r11, [%[a], #52]\n\t" + "LDR r12, [%[b], #52]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[12] * B[14] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[15] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[16] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #64]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #104]\n\t" + /* A[11] * B[16] */ + "LDR r8, [%[a], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[12] * B[15] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[13] * B[14] */ + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[14] * B[13] */ + "LDR r8, [%[a], #56]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[15] * B[12] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #48]\n\t" + 
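/* Note: the low half of the product (words 0..16) is accumulated on the stack
 * ("STR ..., [sp, #0]" through "[sp, #64]") while words 17 and up are written
 * straight to the output ("STR ..., [%[r], #68]" onwards), presumably so that r may
 * alias a or b without clobbering input words that are still needed. The LDM sp! /
 * STM %[r]! run at the end of the function then copies the 17 buffered low words
 * out, roughly (illustrative sketch only):
 *
 *     sp_digit t[17];               // stack buffer standing in for r[0..16]
 *     ...                           // columns 0..16 land in t, 17..33 go to r
 *     memcpy(r, t, sizeof(t));      // the final LDM/STM copy-out
 */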
"UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[16] * B[11] */ + "LDR r8, [%[a], #64]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #108]\n\t" + /* A[16] * B[12] */ + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[15] * B[13] */ + "LDR r8, [%[a], #60]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * B[14] */ + "LDR r11, [%[a], #56]\n\t" + "LDR r12, [%[b], #56]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[15] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[16] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #64]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #112]\n\t" + /* A[13] * B[16] */ + "LDR r8, [%[a], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[14] * B[15] */ + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[15] * B[14] */ + "LDR r8, [%[a], #60]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[16] * B[13] */ + "LDR r8, [%[a], #64]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #116]\n\t" + /* A[16] * B[14] */ + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[15] * B[15] */ + "LDR r11, [%[a], #60]\n\t" + "LDR r12, [%[b], #60]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[14] * B[16] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #64]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #120]\n\t" + /* A[15] * B[16] */ + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[16] * B[15] */ + "LDR r8, [%[a], #64]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #124]\n\t" + /* A[16] * B[16] */ + "UMLAL r5, r3, r8, r9\n\t" + "STR r5, [%[r], #128]\n\t" + "STR r3, [%[r], #132]\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3}\n\t" + "STM %[r]!, {r3}\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL /* Square a and put result in r. (r = a * a) * * r A single precision integer. * a A single precision integer. 
*/ -SP_NOINLINE static void sp_521_sqr_17(sp_digit* r, const sp_digit* a) +static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #136\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #64\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" + "SUB sp, sp, #0x88\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_521_sqr_17_outer_%=:\n\t" + "SUBS r3, r5, #0x40\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_521_sqr_17_inner_%=:\n\t" + "CMP r4, r3\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 4f\n\t" + "BEQ L_sp_521_sqr_17_op_sqr_%=\n\t" #else - "beq.n 4f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ + "BEQ.N L_sp_521_sqr_17_op_sqr_%=\n\t" +#endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[a], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "bal L_sp_521_sqr_17_op_done_%=\n\t" + "\n" + "L_sp_521_sqr_17_op_sqr_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "\n" + "L_sp_521_sqr_17_op_done_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x44\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bal 5f\n\t" + "BEQ L_sp_521_sqr_17_inner_done_%=\n\t" #else - "bal.n 5f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #68\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" + "BEQ.N L_sp_521_sqr_17_inner_done_%=\n\t" +#endif + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" + "BGT L_sp_521_sqr_17_inner_done_%=\n\t" #else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "cmp %[a], r2\n\t" + "BGT.N L_sp_521_sqr_17_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bgt 3f\n\t" + "BLE L_sp_521_sqr_17_inner_%=\n\t" #else - "bgt.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" + "BLE.N L_sp_521_sqr_17_inner_%=\n\t" +#endif + "\n" + "L_sp_521_sqr_17_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, 
#0x4\n\t" + "CMP r5, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" + "BLE L_sp_521_sqr_17_outer_%=\n\t" #else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #128\n\t" - "cmp r8, r6\n\t" + "BLE.N L_sp_521_sqr_17_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "LDM sp!, {r6, r7}\n\t" + "STM %[r]!, {r6, r7}\n\t" + "SUB r5, r5, #0x8\n\t" + "\n" + "L_sp_521_sqr_17_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" + "BGT L_sp_521_sqr_17_store_%=\n\t" #else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #132\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, #136\n\t" - "add sp, sp, r6\n\t" + "BGT.N L_sp_521_sqr_17_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } +#else +/* Square a and put result in r. (r = a * a) + * + * r A single precision integer. + * a A single precision integer. 
+ */ +static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "SUB sp, sp, #0x44\n\t" + /* A[0] * A[0] */ + "LDR r10, [%[a]]\n\t" + "UMULL r8, r3, r10, r10\n\t" + "MOV r4, #0x0\n\t" + "STR r8, [sp]\n\t" + /* A[0] * A[1] */ + "LDR r10, [%[a], #4]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [sp, #4]\n\t" + /* A[0] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * A[1] */ + "LDR r10, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #8]\n\t" + /* A[0] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r2, [sp, #12]\n\t" + /* A[0] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[1] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[2] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [sp, #16]\n\t" + /* A[0] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #20]\n\t" + /* A[0] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS 
r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #24]\n\t" + /* A[0] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #28]\n\t" + /* A[0] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #32]\n\t" + /* A[0] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #36]\n\t" + /* A[0] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + 
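/* Note: the square computes each cross product only once. For the wider columns,
 * the terms a[i] * a[j] with i < j and i + j == k are summed into r5:r6:r7, that
 * partial sum is doubled with "ADDS r5,r5,r5 / ADCS r6,r6,r6 / ADC r7,r7,r7", and
 * the single diagonal term a[k/2] * a[k/2] (present only when k is even) is added
 * once; the narrow early columns instead simply add each UMULL result twice. A
 * rough C model of one column (illustrative sketch, simplified to plain 64-bit
 * accumulation; the assembly keeps a third carry word):
 *
 *     uint64_t cross = 0, col;
 *     for (i = (k <= 16) ? 0 : (k - 16); 2 * i < k; i++)
 *         cross += (uint64_t)a[i] * a[k - i];   // each pair counted once
 *     col = 2 * cross;                          // the ADDS/ADCS/ADC doubling
 *     if ((k & 1) == 0)
 *         col += (uint64_t)a[k / 2] * a[k / 2]; // diagonal square, added once
 */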
"ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #40]\n\t" + /* A[0] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #44]\n\t" + /* A[0] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #48]\n\t" + /* A[0] * 
A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #52]\n\t" + /* A[0] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #56]\n\t" + /* A[0] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #12]\n\t" + 
"UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #60]\n\t" + /* A[0] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #64]\n\t" + /* A[1] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[2] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" 
+ "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #68]\n\t" + /* A[2] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[3] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #72]\n\t" + /* A[3] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[4] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, 
r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [%[r], #76]\n\t" + /* A[4] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[5] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[10] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #80]\n\t" + /* A[5] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[6] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[10] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #84]\n\t" + /* A[6] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[7] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, 
#0x0\n\t" + /* A[10] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[11] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [%[r], #88]\n\t" + /* A[7] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[8] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[10] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[11] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #92]\n\t" + /* A[8] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[9] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[10] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[11] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[12] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #96]\n\t" + /* A[9] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[10] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[11] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[12] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #48]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [%[r], #100]\n\t" + /* A[10] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], 
#40]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[11] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[12] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #48]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[13] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #104]\n\t" + /* A[11] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #44]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[12] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #48]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[13] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #52]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #108]\n\t" + /* A[12] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #48]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[13] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #52]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[14] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [%[r], #112]\n\t" + /* A[13] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #52]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #56]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #116]\n\t" + /* A[14] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #56]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[15] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r2, [%[r], #120]\n\t" + /* A[15] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "LDR r12, [%[a], #60]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + 
"ADC r2, r2, #0x0\n\t" + "STR r3, [%[r], #124]\n\t" + /* A[16] * A[16] */ + "LDR r10, [%[a], #64]\n\t" + "UMLAL r4, r2, r10, r10\n\t" + "STR r4, [%[r], #128]\n\t" + "STR r2, [%[r], #132]\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2}\n\t" + "STM %[r]!, {r2}\n\t" + : [r] "+r" (r), [a] "+r" (a) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Add b to a into r. (r = a + b) * @@ -32817,39 +52604,45 @@ SP_NOINLINE static void sp_521_sqr_17(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r8, #0\n\t" - "add r6, r6, #68\n\t" - "sub r8, r8, #1\n\t" - "\n1:\n\t" - "adds %[c], %[c], r8\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r3, #0x0\n\t" + "ADD r12, %[a], #0x40\n\t" + "\n" + "L_sp_521_add_17_word_%=:\n\t" + "ADDS r3, r3, #0x-1\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "MOV r4, #0x0\n\t" + "ADC r3, r4, #0x0\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_521_add_17_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_521_add_17_word_%=\n\t" +#endif + "ADDS r3, r3, #0x-1\n\t" + "LDM %[a], {r4}\n\t" + "LDM %[b], {r8}\n\t" + "ADCS r4, r4, r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r4, #0x0\n\t" + "ADC %[r], r4, #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #else @@ -32859,64 +52652,52 @@ SP_NOINLINE static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_521_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r6, [%[b]]\n\t" - "adcs r4, r4, r6\n\t" - "str r4, [%[r]]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3}\n\t" + "LDM %[b]!, {r7}\n\t" + "ADCS r3, r3, r7\n\t" + "STM %[r]!, {r3}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -32927,37 +52708,43 @@ SP_NOINLINE static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
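Both sp_521_add_17 variants are plain ripple-carry additions; the rewrite widens the loop body to four words per iteration (LDM/ADCS/STM) and returns the final carry in r0 instead of a separate C variable. A hedged C equivalent of the underlying operation (function name and word-count parameter are illustrative):

#include <stdint.h>

/* Illustrative multi-precision add with carry, as in sp_521_add_17. */
static uint32_t add_words(uint32_t* r, const uint32_t* a,
                          const uint32_t* b, int n)
{
    uint64_t t = 0;
    int i;

    for (i = 0; i < n; i++) {
        t += (uint64_t)a[i] + b[i];
        r[i] = (uint32_t)t;   /* low 32 bits of the column */
        t >>= 32;             /* carry into the next word */
    }
    return (uint32_t)t;       /* final carry, 0 or 1 */
}

sp_521_sub_17 below follows the same pattern with SUBS/SBCS and a borrow, returning either 0 or an all-ones word.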
*/ -SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "add r6, r6, #68\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "sbcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r11, #0x0\n\t" + "ADD r12, %[a], #0x40\n\t" + "\n" + "L_sp_521_sub_17_word_%=:\n\t" + "RSBS r11, r11, #0x0\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC r11, r3, r3\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_521_sub_17_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_521_sub_17_word_%=\n\t" +#endif + "RSBS r11, r11, #0x0\n\t" + "LDM %[a]!, {r3}\n\t" + "LDM %[b]!, {r7}\n\t" + "SBCS r3, r3, r7\n\t" + "STM %[r]!, {r3}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #else @@ -32967,63 +52754,51 @@ SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_521_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r6, [%[b]]\n\t" - "sbcs r4, r4, r6\n\t" - "str r4, [%[r]]\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3}\n\t" + "LDM %[b]!, {r7}\n\t" + "SBCS r3, r3, r7\n\t" + "STM %[r]!, {r3}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -33055,14 +52830,14 @@ static void sp_521_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -33240,6 +53015,7 @@ static int sp_521_point_to_ecc_point_17(const sp_point_521* p, ecc_point* pm) return err; } +#ifdef WOLFSSL_SP_SMALL /* Conditionally subtract b 
from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -33248,322 +53024,727 @@ static int sp_521_point_to_ecc_point_17(const sp_point_521* p, ecc_point* pm) * b A single precision number to subtract. * m Mask value to apply. */ -SP_NOINLINE static sp_digit sp_521_cond_sub_17(sp_digit* r, const sp_digit* a, - const sp_digit* b, sp_digit m) +static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #68\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r5, [%[a], r8]\n\t" - "sbcs r5, r5, r6\n\t" - "sbcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_521_cond_sub_17_words_%=:\n\t" + "SUBS r4, r8, r4\n\t" + "LDR r6, [%[a], r5]\n\t" + "LDR r7, [%[b], r5]\n\t" + "AND r7, r7, %[m]\n\t" + "SBCS r6, r6, r7\n\t" + "SBC r4, r8, r8\n\t" + "STR r6, [%[r], r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x44\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_521_cond_sub_17_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_521_cond_sub_17_words_%=\n\t" +#endif + "MOV %[r], r4\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
+ */ +static sp_digit sp_521_cond_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r5, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SUBS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r8, [%[b]]\n\t" + "AND r8, r8, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "STR r6, [%[r]]\n\t" + "SBC %[r], r5, r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ /* Reduce the number back to 521 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. 
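Both sp_521_cond_sub_17 variants subtract without branching on the condition: m is either 0 or all ones, so ANDing each word of b with m makes the SUBS/SBCS chain subtract either b or zero in the same number of instructions regardless of the condition. A small C sketch of the idea (names and the word-count parameter are assumptions for the example):

#include <stdint.h>

/* Illustrative constant-time conditional subtract, as in
 * sp_521_cond_sub_17.  m must be 0 (leave a unchanged) or
 * 0xffffffff (subtract b); n is the word count. */
static uint32_t cond_sub_words(uint32_t* r, const uint32_t* a,
                               const uint32_t* b, uint32_t m, int n)
{
    uint32_t borrow = 0;
    int i;

    for (i = 0; i < n; i++) {
        uint64_t t = (uint64_t)a[i] - (b[i] & m) - borrow;
        r[i] = (uint32_t)t;
        borrow = (uint32_t)(t >> 32) & 1;   /* 1 when the word wrapped */
    }
    return (uint32_t)0 - borrow;            /* 0 or 0xffffffff, like the asm */
}

sp_521_mont_reduce_order_17 further down finishes with exactly this helper to bring its result below the modulus.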
*/ -SP_NOINLINE static void sp_521_mont_reduce_17(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_521_mont_reduce_17(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - (void)mp; - (void)m; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; __asm__ __volatile__ ( - "sub sp, sp, #68\n\t" - "mov r12, sp\n\t" - "add r14, %[a], #64\n\t" - "ldm r14!, {r1, r2, r3, r4, r5, r6, r7, r8, r9, r10}\n\t" - "lsr r1, r1, #9\n\t" - "orr r1, r1, r2, lsl #23\n\t" - "lsr r2, r2, #9\n\t" - "orr r2, r2, r3, lsl #23\n\t" - "lsr r3, r3, #9\n\t" - "orr r3, r3, r4, lsl #23\n\t" - "lsr r4, r4, #9\n\t" - "orr r4, r4, r5, lsl #23\n\t" - "lsr r5, r5, #9\n\t" - "orr r5, r5, r6, lsl #23\n\t" - "lsr r6, r6, #9\n\t" - "orr r6, r6, r7, lsl #23\n\t" - "lsr r7, r7, #9\n\t" - "orr r7, r7, r8, lsl #23\n\t" - "lsr r8, r8, #9\n\t" - "orr r8, r8, r9, lsl #23\n\t" - "lsr r9, r9, #9\n\t" - "orr r9, r9, r10, lsl #23\n\t" - "stm r12!, {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" - "mov r1, r10\n\t" - "ldm r14, {r2, r3, r4, r5, r6, r7, r8}\n\t" - "lsr r1, r1, #9\n\t" - "orr r1, r1, r2, lsl #23\n\t" - "lsr r2, r2, #9\n\t" - "orr r2, r2, r3, lsl #23\n\t" - "lsr r3, r3, #9\n\t" - "orr r3, r3, r4, lsl #23\n\t" - "lsr r4, r4, #9\n\t" - "orr r4, r4, r5, lsl #23\n\t" - "lsr r5, r5, #9\n\t" - "orr r5, r5, r6, lsl #23\n\t" - "lsr r6, r6, #9\n\t" - "orr r6, r6, r7, lsl #23\n\t" - "lsr r7, r7, #9\n\t" - "orr r7, r7, r8, lsl #23\n\t" - "lsr r8, r8, #9\n\t" - "stm r12!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" - "mov r14, sp\n\t" - "ldm %[a], {r1, r2, r3, r4, r5, r6}\n\t" - "ldm r14!, {r7, r8, r9, r10, r11, r12}\n\t" - "adds r1, r1, r7\n\t" - "adcs r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r12\n\t" - "stm %[a]!, {r1, r2, r3, r4, r5, r6}\n\t" - "ldm %[a], {r1, r2, r3, r4, r5, r6}\n\t" - "ldm r14!, {r7, r8, r9, r10, r11, r12}\n\t" - "adcs r1, r1, r7\n\t" - "adcs r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r12\n\t" - "stm %[a]!, {r1, r2, r3, r4, r5, r6}\n\t" - "ldm %[a], {r1, r2, r3, r4, r5}\n\t" - "ldm r14!, {r7, r8, r9, r10, r11}\n\t" - "mov r14, #0x1ff\n\t" - "and r5, r5, r14\n\t" - "adcs r1, r1, r7\n\t" - "adcs r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adcs r4, r4, r10\n\t" - "adcs r5, r5, r11\n\t" - "lsr r12, r5, #9\n\t" - "and r5, r5, r14\n\t" - "stm %[a]!, {r1, r2, r3, r4, r5}\n\t" - "sub %[a], %[a], #68\n\t" - "mov r11, #0\n\t" - "ldm %[a], {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" - "adds r1, r1, r12\n\t" - "adcs r2, r2, r11\n\t" - "adcs r3, r3, r11\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "adcs r9, r9, r11\n\t" - "stm %[a]!, {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" - "ldm %[a], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" - "adcs r1, r1, r11\n\t" - "adcs r2, r2, r11\n\t" - "adcs r3, r3, r11\n\t" - "adcs r4, r4, r11\n\t" - "adcs r5, r5, r11\n\t" - "adcs r6, r6, r11\n\t" - "adcs r7, r7, r11\n\t" - "adcs r8, r8, r11\n\t" - "stm %[a]!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" - "add sp, sp, #68\n\t" - "sub %[a], %[a], #68\n\t" + "SUB sp, sp, #0x44\n\t" + "MOV r12, sp\n\t" + /* Shift top down by 9 bits */ + "ADD lr, %[a], #0x40\n\t" + /* 0-7 */ + "LDM lr!, {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "LSR r1, r1, #9\n\t" + "ORR r1, r1, r2, lsl #23\n\t" + "LSR r2, r2, #9\n\t" + "ORR r2, r2, r3, lsl #23\n\t" + "LSR r3, r3, #9\n\t" + "ORR r3, r3, r4, lsl #23\n\t" + "LSR r4, r4, #9\n\t" + "ORR r4, r4, r5, lsl #23\n\t" + "LSR r5, r5, 
#9\n\t" + "ORR r5, r5, r6, lsl #23\n\t" + "LSR r6, r6, #9\n\t" + "ORR r6, r6, r7, lsl #23\n\t" + "LSR r7, r7, #9\n\t" + "ORR r7, r7, r8, lsl #23\n\t" + "LSR r8, r8, #9\n\t" + "ORR r8, r8, r9, lsl #23\n\t" + "STM r12!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "MOV r1, r9\n\t" + /* 8-16 */ + "LDM lr!, {r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "LSR r1, r1, #9\n\t" + "ORR r1, r1, r2, lsl #23\n\t" + "LSR r2, r2, #9\n\t" + "ORR r2, r2, r3, lsl #23\n\t" + "LSR r3, r3, #9\n\t" + "ORR r3, r3, r4, lsl #23\n\t" + "LSR r4, r4, #9\n\t" + "ORR r4, r4, r5, lsl #23\n\t" + "LSR r5, r5, #9\n\t" + "ORR r5, r5, r6, lsl #23\n\t" + "LSR r6, r6, #9\n\t" + "ORR r6, r6, r7, lsl #23\n\t" + "LSR r7, r7, #9\n\t" + "ORR r7, r7, r8, lsl #23\n\t" + "LSR r8, r8, #9\n\t" + "ORR r8, r8, r9, lsl #23\n\t" + "LSR r9, r9, #9\n\t" + "STM r12!, {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + /* Add top to bottom */ + /* 0-5 */ + "LDM %[a], {r1, r2, r3, r4, r5, r6}\n\t" + "LDM sp!, {r7, r8, r9, r10, r11, r12}\n\t" + "ADDS r1, r1, r7\n\t" + "ADCS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADCS r4, r4, r10\n\t" + "ADCS r5, r5, r11\n\t" + "ADCS r6, r6, r12\n\t" + "STM %[a]!, {r1, r2, r3, r4, r5, r6}\n\t" + /* 6-11 */ + "LDM %[a], {r1, r2, r3, r4, r5, r6}\n\t" + "LDM sp!, {r7, r8, r9, r10, r11, r12}\n\t" + "ADCS r1, r1, r7\n\t" + "ADCS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADCS r4, r4, r10\n\t" + "ADCS r5, r5, r11\n\t" + "ADCS r6, r6, r12\n\t" + "STM %[a]!, {r1, r2, r3, r4, r5, r6}\n\t" + /* 12-16 */ + "LDM %[a], {r1, r2, r3, r4, r5}\n\t" + "LDM sp!, {r7, r8, r9, r10, r11}\n\t" + "MOV lr, #0x1ff\n\t" + "AND r5, r5, lr\n\t" + "ADCS r1, r1, r7\n\t" + "ADCS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADCS r4, r4, r10\n\t" + "ADCS r5, r5, r11\n\t" + "LSR r12, r5, #9\n\t" + "AND r5, r5, lr\n\t" + "STM %[a]!, {r1, r2, r3, r4, r5}\n\t" + "SUB %[a], %[a], #0x44\n\t" + /* Add overflow */ + /* 0-8 */ + "LDM %[a], {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + "ADDS r1, r1, r12\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4, r5, r6, r7, r8, r9}\n\t" + /* 9-16 */ + "LDM %[a], {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4, r5, r6, r7, r8}\n\t" : [a] "+r" (a) : - : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14" + : "memory", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - + (void)m_p; + (void)mp_p; } +#ifdef WOLFSSL_SP_NO_UMAAL /* Reduce the number back to 521 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. 
*/ -SP_NOINLINE static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - sp_digit ca = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( - "mov r9, %[mp]\n\t" - "mov r12, %[m]\n\t" - "mov r10, %[a]\n\t" - "mov r4, #0\n\t" - "add r11, r10, #68\n\t" - "\n1:\n\t" + "LDR lr, [%[m]]\n\t" + /* i = 0 */ + "MOV r11, #0x0\n\t" + "MOV r3, #0x0\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[a], #4]\n\t" + "\n" + "L_sp_521_mont_reduce_order_17_word_%=:\n\t" /* mu = a[i] * mp */ - "mov %[mp], r9\n\t" - "ldr %[a], [r10]\n\t" - "mul %[mp], %[mp], %[a]\n\t" - "sub r14, r11, #4\n\t" - "cmp r10, r14\n\t" - "bne L_521_mont_reduce_17_nomask\n\t" - "mov r8, #0x1ff\n\t" - "and %[mp], %[mp], r8\n\t" - "L_521_mont_reduce_17_nomask:\n\t" - "mov %[m], r12\n\t" - "add r14, r10, #64\n\t" - "\n2:\n\t" - /* a[i+j] += m[j] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+j+1] += m[j+1] * mu */ - "ldr %[a], [r10]\n\t" - "mov r4, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r4, r4, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r5, r5, %[a]\n\t" - "adc r4, r4, #0\n\t" - "str r5, [r10], #4\n\t" - "cmp r10, r14\n\t" + "MUL r10, %[mp], r4\n\t" + "CMP r11, #0x40\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 2b\n\t" + "BNE L_sp_521_mont_reduce_order_17_nomask_%=\n\t" #else - "blt.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ + "BNE.N L_sp_521_mont_reduce_order_17_nomask_%=\n\t" +#endif + "MOV r9, #0x1ff\n\t" + "AND r10, r10, r9\n\t" + "\n" + "L_sp_521_mont_reduce_order_17_nomask_%=:\n\t" + /* a[i+0] += m[0] * mu */ + "MOV r7, #0x0\n\t" + "UMLAL r4, r7, r10, lr\n\t" + "STR r4, [%[a]]\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r9, [%[m], #4]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r5, r6, r10, r9\n\t" + "MOV r4, r5\n\t" + "ADDS r4, r4, r7\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r9, [%[m], #8]\n\t" + "LDR r5, [%[a], #8]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r5, r7, r10, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r9, [%[m], #12]\n\t" + "LDR r12, [%[a], #12]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #12]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r9, [%[m], #16]\n\t" + "LDR r12, [%[a], #16]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #16]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r9, [%[m], #20]\n\t" + "LDR r12, [%[a], #20]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #20]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r9, [%[m], #24]\n\t" + "LDR r12, [%[a], #24]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #24]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r9, [%[m], 
#28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #28]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r9, [%[m], #32]\n\t" + "LDR r12, [%[a], #32]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #32]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r9, [%[m], #36]\n\t" + "LDR r12, [%[a], #36]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #36]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r9, [%[m], #40]\n\t" + "LDR r12, [%[a], #40]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #40]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r9, [%[m], #44]\n\t" + "LDR r12, [%[a], #44]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #44]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r9, [%[m], #48]\n\t" + "LDR r12, [%[a], #48]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #48]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r9, [%[m], #52]\n\t" + "LDR r12, [%[a], #52]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #52]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r9, [%[m], #56]\n\t" + "LDR r12, [%[a], #56]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #56]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r9, [%[m], #60]\n\t" + "LDR r12, [%[a], #60]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #60]\n\t" + "ADC r6, r6, #0x0\n\t" /* a[i+16] += m[16] * mu */ - "mov r5, %[ca]\n\t" - "mov %[ca], #0\n\t" - /* Multiply m[16] and mu - Start */ - "ldr r8, [%[m]]\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "adc %[ca], %[ca], #0\n\t" - /* Multiply m[16] and mu - Done */ - "ldr r6, [r10]\n\t" - "ldr r8, [r10, #4]\n\t" - "adds r6, r6, r4\n\t" - "adcs r8, r8, r5\n\t" - "adc %[ca], %[ca], #0\n\t" - "str r6, [r10]\n\t" - "str r8, [r10, #4]\n\t" - "mov r4, #0\n\t" - /* Next word in a */ - "sub r10, r10, #60\n\t" - "cmp r10, r11\n\t" + "LDR r9, [%[m], #64]\n\t" + "LDR r12, [%[a], #64]\n\t" + "UMULL r8, r9, r10, r9\n\t" + "ADDS r6, r6, r8\n\t" + "ADCS r7, r9, r3\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, r3\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #64]\n\t" + "LDR r12, [%[a], #68]\n\t" + "ADCS r12, r12, r7\n\t" + "STR r12, [%[a], #68]\n\t" + "ADC r3, r3, #0x0\n\t" + /* i += 1 */ + "ADD r11, r11, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r11, #0x44\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_521_mont_reduce_order_17_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "sub r10, r10, #4\n\t" - "ldr r4, [r10], #4\n\t" - "ldr r5, [r10]\n\t" - "lsr r4, r4, #9\n\t" - "orr r4, r4, r5, lsl #23\n\t" - "str r4, [r10], #4\n\t" - "ldr r4, [r10]\n\t" - "lsr r5, r5, #9\n\t" - "orr r5, r5, r4, lsl #23\n\t" - "str r5, [r10], #4\n\t" - "ldr r5, [r10]\n\t" - "lsr r4, r4, #9\n\t" - "orr r4, r4, r5, lsl #23\n\t" - "str r4, [r10], #4\n\t" - "ldr r4, [r10]\n\t" - "lsr r5, r5, #9\n\t" - "orr r5, r5, r4, lsl 
#23\n\t" - "str r5, [r10], #4\n\t" - "ldr r5, [r10]\n\t" - "lsr r4, r4, #9\n\t" - "orr r4, r4, r5, lsl #23\n\t" - "str r4, [r10], #4\n\t" - "ldr r4, [r10]\n\t" - "lsr r5, r5, #9\n\t" - "orr r5, r5, r4, lsl #23\n\t" - "str r5, [r10], #4\n\t" - "ldr r5, [r10]\n\t" - "lsr r4, r4, #9\n\t" - "orr r4, r4, r5, lsl #23\n\t" - "str r4, [r10], #4\n\t" - "ldr r4, [r10]\n\t" - "lsr r5, r5, #9\n\t" - "orr r5, r5, r4, lsl #23\n\t" - "str r5, [r10], #4\n\t" - "ldr r5, [r10]\n\t" - "lsr r4, r4, #9\n\t" - "orr r4, r4, r5, lsl #23\n\t" - "str r4, [r10], #4\n\t" - "ldr r4, [r10]\n\t" - "lsr r5, r5, #9\n\t" - "orr r5, r5, r4, lsl #23\n\t" - "str r5, [r10], #4\n\t" - "ldr r5, [r10]\n\t" - "lsr r4, r4, #9\n\t" - "orr r4, r4, r5, lsl #23\n\t" - "str r4, [r10], #4\n\t" - "ldr r4, [r10]\n\t" - "lsr r5, r5, #9\n\t" - "orr r5, r5, r4, lsl #23\n\t" - "str r5, [r10], #4\n\t" - "ldr r5, [r10]\n\t" - "lsr r4, r4, #9\n\t" - "orr r4, r4, r5, lsl #23\n\t" - "str r4, [r10], #4\n\t" - "ldr r4, [r10]\n\t" - "lsr r5, r5, #9\n\t" - "orr r5, r5, r4, lsl #23\n\t" - "str r5, [r10], #4\n\t" - "ldr r5, [r10]\n\t" - "lsr r4, r4, #9\n\t" - "orr r4, r4, r5, lsl #23\n\t" - "str r4, [r10], #4\n\t" - "ldr r4, [r10]\n\t" - "lsr r5, r5, #9\n\t" - "orr r5, r5, r4, lsl #23\n\t" - "str r5, [r10], #4\n\t" - "lsr r4, r4, #9\n\t" - "str r4, [r10]\n\t" - "lsr %[ca], r4, #9\n\t" - "sub %[a], r10, #64\n\t" - "mov %[m], r12\n\t" - : [ca] "+r" (ca), [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" + "BLT.N L_sp_521_mont_reduce_order_17_word_%=\n\t" +#endif + /* Loop Done */ + "STR r4, [%[a]]\n\t" + "STR r5, [%[a], #4]\n\t" + "SUB %[a], %[a], #0x4\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LSR r6, r6, #9\n\t" + "ORR r6, r6, r7, lsl #23\n\t" + "STR r6, [%[a], #4]\n\t" + "LDR r6, [%[a], #8]\n\t" + "LSR r7, r7, #9\n\t" + "ORR r7, r7, r6, lsl #23\n\t" + "STR r7, [%[a], #8]\n\t" + "LDR r7, [%[a], #12]\n\t" + "LSR r6, r6, #9\n\t" + "ORR r6, r6, r7, lsl #23\n\t" + "STR r6, [%[a], #12]\n\t" + "LDR r6, [%[a], #16]\n\t" + "LSR r7, r7, #9\n\t" + "ORR r7, r7, r6, lsl #23\n\t" + "STR r7, [%[a], #16]\n\t" + "LDR r7, [%[a], #20]\n\t" + "LSR r6, r6, #9\n\t" + "ORR r6, r6, r7, lsl #23\n\t" + "STR r6, [%[a], #20]\n\t" + "LDR r6, [%[a], #24]\n\t" + "LSR r7, r7, #9\n\t" + "ORR r7, r7, r6, lsl #23\n\t" + "STR r7, [%[a], #24]\n\t" + "LDR r7, [%[a], #28]\n\t" + "LSR r6, r6, #9\n\t" + "ORR r6, r6, r7, lsl #23\n\t" + "STR r6, [%[a], #28]\n\t" + "LDR r6, [%[a], #32]\n\t" + "LSR r7, r7, #9\n\t" + "ORR r7, r7, r6, lsl #23\n\t" + "STR r7, [%[a], #32]\n\t" + "LDR r7, [%[a], #36]\n\t" + "LSR r6, r6, #9\n\t" + "ORR r6, r6, r7, lsl #23\n\t" + "STR r6, [%[a], #36]\n\t" + "LDR r6, [%[a], #40]\n\t" + "LSR r7, r7, #9\n\t" + "ORR r7, r7, r6, lsl #23\n\t" + "STR r7, [%[a], #40]\n\t" + "LDR r7, [%[a], #44]\n\t" + "LSR r6, r6, #9\n\t" + "ORR r6, r6, r7, lsl #23\n\t" + "STR r6, [%[a], #44]\n\t" + "LDR r6, [%[a], #48]\n\t" + "LSR r7, r7, #9\n\t" + "ORR r7, r7, r6, lsl #23\n\t" + "STR r7, [%[a], #48]\n\t" + "LDR r7, [%[a], #52]\n\t" + "LSR r6, r6, #9\n\t" + "ORR r6, r6, r7, lsl #23\n\t" + "STR r6, [%[a], #52]\n\t" + "LDR r6, [%[a], #56]\n\t" + "LSR r7, r7, #9\n\t" + "ORR r7, r7, r6, lsl #23\n\t" + "STR r7, [%[a], #56]\n\t" + "LDR r7, [%[a], #60]\n\t" + "LSR r6, r6, #9\n\t" + "ORR r6, r6, r7, lsl #23\n\t" + "STR r6, [%[a], #60]\n\t" + "LDR r6, [%[a], #64]\n\t" + "LSR r7, r7, #9\n\t" + "ORR r7, r7, r6, lsl #23\n\t" + "STR r7, [%[a], #64]\n\t" + "LSR r6, r6, #9\n\t" + "STR r6, [%[a], #68]\n\t" + "LSR r3, r6, #9\n\t" + 
"ADD %[a], %[a], #0x4\n\t" + "MOV %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - - sp_521_cond_sub_17(a - 17, a, m, (sp_digit)0 - ca); + sp_521_cond_sub_17(a - 17, a, m, (sp_digit)0 - mp); } +#else +/* Reduce the number back to 521 bits using Montgomery reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_521_mont_reduce_order_17(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[a], #12]\n\t" + "LDR r10, [%[a], #16]\n\t" + "\n" + "L_sp_521_mont_reduce_order_17_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL lr, %[mp], r6\n\t" + "CMP r4, #0x40\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BNE L_sp_521_mont_reduce_order_17_nomask_%=\n\t" +#else + "BNE.N L_sp_521_mont_reduce_order_17_nomask_%=\n\t" +#endif + "MOV r12, #0x1ff\n\t" + "AND lr, lr, r12\n\t" + "\n" + "L_sp_521_mont_reduce_order_17_nomask_%=:\n\t" + /* a[i+0] += m[0] * mu */ + "LDR r12, [%[m]]\n\t" + "MOV r3, #0x0\n\t" + "UMAAL r6, r3, lr, r12\n\t" + "STR r6, [%[a]]\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r12, [%[m], #4]\n\t" + "MOV r6, r7\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r12, [%[m], #8]\n\t" + "MOV r7, r8\n\t" + "UMAAL r7, r3, lr, r12\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r12, [%[m], #12]\n\t" + "MOV r8, r9\n\t" + "UMAAL r8, r3, lr, r12\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r12, [%[m], #16]\n\t" + "MOV r9, r10\n\t" + "UMAAL r9, r3, lr, r12\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r12, [%[m], #20]\n\t" + "LDR r10, [%[a], #20]\n\t" + "UMAAL r10, r3, lr, r12\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r12, [%[m], #24]\n\t" + "LDR r11, [%[a], #24]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r12, [%[m], #28]\n\t" + "LDR r11, [%[a], #28]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r12, [%[m], #32]\n\t" + "LDR r11, [%[a], #32]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r12, [%[m], #36]\n\t" + "LDR r11, [%[a], #36]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r12, [%[m], #40]\n\t" + "LDR r11, [%[a], #40]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r12, [%[m], #44]\n\t" + "LDR r11, [%[a], #44]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r12, [%[m], #48]\n\t" + "LDR r11, [%[a], #48]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r12, [%[m], #52]\n\t" + "LDR r11, [%[a], #52]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r12, [%[m], #56]\n\t" + "LDR r11, [%[a], #56]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r12, [%[m], #60]\n\t" + "LDR r11, [%[a], #60]\n\t" + "UMAAL 
r11, r3, lr, r12\n\t" + "STR r11, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r12, [%[m], #64]\n\t" + "LDR r11, [%[a], #64]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "LDR lr, [%[a], #68]\n\t" + "MOV r12, #0x0\n\t" + "UMAAL r3, lr, r12, r12\n\t" + "STR r11, [%[a], #64]\n\t" + "ADDS r3, r3, r5\n\t" + "ADC r5, lr, #0x0\n\t" + "STR r3, [%[a], #68]\n\t" + /* i += 1 */ + "ADD r4, r4, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r4, #0x44\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_521_mont_reduce_order_17_word_%=\n\t" +#else + "BLT.N L_sp_521_mont_reduce_order_17_word_%=\n\t" +#endif + /* Loop Done */ + "STR r6, [%[a]]\n\t" + "STR r7, [%[a], #4]\n\t" + "STR r8, [%[a], #8]\n\t" + "STR r9, [%[a], #12]\n\t" + "STR r10, [%[a], #16]\n\t" + "SUB %[a], %[a], #0x4\n\t" + "LDR r12, [%[a]]\n\t" + "LDR r3, [%[a], #4]\n\t" + "LSR r12, r12, #9\n\t" + "ORR r12, r12, r3, lsl #23\n\t" + "STR r12, [%[a], #4]\n\t" + "LDR r12, [%[a], #8]\n\t" + "LSR r3, r3, #9\n\t" + "ORR r3, r3, r12, lsl #23\n\t" + "STR r3, [%[a], #8]\n\t" + "LDR r3, [%[a], #12]\n\t" + "LSR r12, r12, #9\n\t" + "ORR r12, r12, r3, lsl #23\n\t" + "STR r12, [%[a], #12]\n\t" + "LDR r12, [%[a], #16]\n\t" + "LSR r3, r3, #9\n\t" + "ORR r3, r3, r12, lsl #23\n\t" + "STR r3, [%[a], #16]\n\t" + "LDR r3, [%[a], #20]\n\t" + "LSR r12, r12, #9\n\t" + "ORR r12, r12, r3, lsl #23\n\t" + "STR r12, [%[a], #20]\n\t" + "LDR r12, [%[a], #24]\n\t" + "LSR r3, r3, #9\n\t" + "ORR r3, r3, r12, lsl #23\n\t" + "STR r3, [%[a], #24]\n\t" + "LDR r3, [%[a], #28]\n\t" + "LSR r12, r12, #9\n\t" + "ORR r12, r12, r3, lsl #23\n\t" + "STR r12, [%[a], #28]\n\t" + "LDR r12, [%[a], #32]\n\t" + "LSR r3, r3, #9\n\t" + "ORR r3, r3, r12, lsl #23\n\t" + "STR r3, [%[a], #32]\n\t" + "LDR r3, [%[a], #36]\n\t" + "LSR r12, r12, #9\n\t" + "ORR r12, r12, r3, lsl #23\n\t" + "STR r12, [%[a], #36]\n\t" + "LDR r12, [%[a], #40]\n\t" + "LSR r3, r3, #9\n\t" + "ORR r3, r3, r12, lsl #23\n\t" + "STR r3, [%[a], #40]\n\t" + "LDR r3, [%[a], #44]\n\t" + "LSR r12, r12, #9\n\t" + "ORR r12, r12, r3, lsl #23\n\t" + "STR r12, [%[a], #44]\n\t" + "LDR r12, [%[a], #48]\n\t" + "LSR r3, r3, #9\n\t" + "ORR r3, r3, r12, lsl #23\n\t" + "STR r3, [%[a], #48]\n\t" + "LDR r3, [%[a], #52]\n\t" + "LSR r12, r12, #9\n\t" + "ORR r12, r12, r3, lsl #23\n\t" + "STR r12, [%[a], #52]\n\t" + "LDR r12, [%[a], #56]\n\t" + "LSR r3, r3, #9\n\t" + "ORR r3, r3, r12, lsl #23\n\t" + "STR r3, [%[a], #56]\n\t" + "LDR r3, [%[a], #60]\n\t" + "LSR r12, r12, #9\n\t" + "ORR r12, r12, r3, lsl #23\n\t" + "STR r12, [%[a], #60]\n\t" + "LDR r12, [%[a], #64]\n\t" + "LSR r3, r3, #9\n\t" + "ORR r3, r3, r12, lsl #23\n\t" + "STR r3, [%[a], #64]\n\t" + "LSR r12, r12, #9\n\t" + "STR r12, [%[a], #68]\n\t" + "LSR r5, r12, #9\n\t" + "ADD %[a], %[a], #0x4\n\t" + "MOV %[mp], r5\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + sp_521_cond_sub_17(a - 17, a, m, (sp_digit)0 - mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -33710,44 +53891,230 @@ static void sp_521_mont_inv_17(sp_digit* r, const sp_digit* a, sp_digit* td) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
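Both sp_521_mont_reduce_order_17 variants above carry out the same word-by-word Montgomery reduction with R = 2^521: one row of mu*m is accumulated per word, mu is masked to 9 bits on the last word because only bits 512..520 remain to be cleared, the whole value is then shifted down by 521 bits, and a constant-time conditional subtract finishes the job. The portable sketch below is an illustration of that flow only, not the shipped code; the function name, the 34-word in-place buffer and the simple uint64_t carry chain are assumptions made for clarity (the assembly uses UMAAL and a register pipeline instead).

#include <stdint.h>

/* Illustrative sketch only: a is the 34-word value to reduce in place,
 * m is the 17-word modulus and mp is -(m^-1) mod 2^32 (names assumed). */
static void mont_reduce_order_17_sketch(uint32_t* a, const uint32_t* m,
                                        uint32_t mp)
{
    uint32_t res[17];
    uint32_t mu;
    uint64_t c;
    uint64_t t;
    int i;
    int j;

    for (i = 0; i < 17; i++) {
        /* mu = a[i] * mp; on the last word only the low 9 bits matter. */
        mu = a[i] * mp;
        if (i == 16)
            mu &= 0x1ff;

        /* a += mu * m, starting at word i. */
        c = 0;
        for (j = 0; j < 17; j++) {
            t = (uint64_t)a[i + j] + (uint64_t)mu * m[j] + c;
            a[i + j] = (uint32_t)t;
            c = t >> 32;
        }
        for (j = i + 17; (c != 0) && (j < 34); j++) {
            t = (uint64_t)a[j] + c;      /* propagate the row's final carry */
            a[j] = (uint32_t)t;
            c = t >> 32;
        }
    }

    /* The value is now a multiple of 2^521: shift down 16 words + 9 bits. */
    for (i = 0; i < 16; i++)
        res[i] = (a[16 + i] >> 9) | (a[17 + i] << 23);
    res[16] = (a[32] >> 9) | (a[33] << 23);
    for (i = 0; i < 17; i++)
        a[i] = res[i];

    /* The assembly finishes by calling sp_521_cond_sub_17() so that one
     * more subtraction of m happens, in constant time, only when needed. */
}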
*/ -SP_NOINLINE static sp_int32 sp_521_cmp_17(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_521_cmp_17(const sp_digit* a_p, const sp_digit* b_p) { - sp_digit r = 0; - + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mvn r3, r3\n\t" - "mov r6, #64\n\t" - "\n1:\n\t" - "ldr r8, [%[a], r6]\n\t" - "ldr r5, [%[b], r6]\n\t" - "and r8, r8, r3\n\t" - "and r5, r5, r3\n\t" - "mov r4, r8\n\t" - "subs r8, r8, r5\n\t" - "sbc r8, r8, r8\n\t" - "add %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "subs r5, r5, r4\n\t" - "sbc r8, r8, r8\n\t" - "sub %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "sub r6, r6, #4\n\t" - "cmp r6, #0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 1b\n\t" + "MOV r2, #0x-1\n\t" + "MOV r8, #0x1\n\t" + "MOV r7, #0x0\n\t" + "MOV r3, #0x-1\n\t" +#ifdef WOLFSSL_SP_SMALL + "MOV r6, #0x40\n\t" + "\n" + "L_sp_521_cmp_17_words_%=:\n\t" + "LDR r4, [%[a], r6]\n\t" + "LDR r5, [%[b], r6]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "SUBS r6, r6, #0x4\n\t" + "bcs L_sp_521_cmp_17_words_%=\n\t" + "EOR r2, r2, r3\n\t" #else - "bge.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "r3", "r4", "r5", "r6", "r8" + "LDR r4, [%[a], #64]\n\t" + "LDR r5, [%[b], #64]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #60]\n\t" + "LDR r5, [%[b], #60]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #56]\n\t" + "LDR r5, [%[b], #56]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #52]\n\t" + "LDR r5, [%[b], #52]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #48]\n\t" + "LDR r5, [%[b], #48]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #44]\n\t" + "LDR r5, [%[b], #44]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #40]\n\t" + "LDR r5, [%[b], #40]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #36]\n\t" + "LDR r5, [%[b], #36]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #32]\n\t" + "LDR r5, [%[b], #32]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT 
lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #28]\n\t" + "LDR r5, [%[b], #28]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #24]\n\t" + "LDR r5, [%[b], #24]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #20]\n\t" + "LDR r5, [%[b], #20]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #16]\n\t" + "LDR r5, [%[b], #16]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #12]\n\t" + "LDR r5, [%[b], #12]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #8]\n\t" + "LDR r5, [%[b], #8]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #4]\n\t" + "LDR r5, [%[b], #4]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[b]]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "EOR r2, r2, r3\n\t" +#endif /*WOLFSSL_SP_SMALL */ + "MOV %[a], r2\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); - - return r; + return (uint32_t)(size_t)a; } /* Normalize the values in each word to 32. @@ -33803,82 +54170,78 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p, * b Second number to add in Montgomery form. * m Modulus (prime). 
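The rewritten sp_521_cmp_17 above walks the words from most significant to least significant, keeps a running result, and zeroes a mask at the first difference so that later words can no longer influence the outcome; predicated IT/MOV instructions keep the assembly branch-free. A plain C sketch of the same idea follows; the function name is assumed and the ifs are for readability, so unlike the assembly this sketch is not constant time.

#include <stdint.h>

/* Sketch of the compare above (name assumed). */
static int32_t cmp_17_sketch(const uint32_t a[17], const uint32_t b[17])
{
    int32_t  r    = -1;           /* running result (r2 in the assembly)     */
    uint32_t mask = 0xffffffffU;  /* stays all-ones until the first difference */
    int i;

    for (i = 16; i >= 0; i--) {
        uint32_t x = a[i] & mask; /* once a difference has fixed the result, */
        uint32_t y = b[i] & mask; /* later words are forced to compare equal */
        if (x > y)  r = 1;
        if (x < y)  r = -1;
        if (x != y) mask = 0;
    }
    return r ^ (int32_t)mask;     /* all words equal: (-1) ^ (-1) == 0 */
}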
*/ -SP_NOINLINE static void sp_521_mont_add_17(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +static void sp_521_mont_add_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adds r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4}\n\t" - "ldm %[b]!, {r8}\n\t" - "adcs r4, r4, r8\n\t" - "mov r14, #0x1ff\n\t" - "lsr r12, r4, #9\n\t" - "and r4, r4, r14\n\t" - "stm %[r]!, {r4}\n\t" - "sub %[r], %[r], #68\n\t" - "mov r14, #0\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "adds r4, r4, r12\n\t" - "adcs r5, r5, r14\n\t" - "adcs r6, r6, r14\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "adcs r4, r4, r14\n\t" - "adcs r5, r5, r14\n\t" - "adcs r6, r6, r14\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "adcs r4, r4, r14\n\t" - "adcs r5, r5, r14\n\t" - "adcs r6, r6, r14\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "adcs r4, r4, r14\n\t" - "adcs r5, r5, r14\n\t" - "adcs r6, r6, r14\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4}\n\t" - "adcs r4, r4, r14\n\t" - "stm %[r]!, {r4}\n\t" - "sub %[r], %[r], #68\n\t" + "MOV r3, #0x0\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "LDM %[b]!, {r4, r5, r6, r7}\n\t" + "ADDS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "LDM %[b]!, {r4, r5, r6, r7}\n\t" + "ADCS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "LDM %[b]!, {r4, r5, r6, r7}\n\t" + "ADCS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "LDM %[b]!, {r4, r5, r6, r7}\n\t" + "ADCS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r8}\n\t" + "LDM %[b]!, {r4}\n\t" + "ADCS r8, r8, r4\n\t" + "MOV r12, #0x1ff\n\t" + "LSR r3, r8, #9\n\t" + "AND r8, r8, r12\n\t" + "STM %[r]!, {r8}\n\t" + "SUB %[r], %[r], #0x44\n\t" + "LDM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r3\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS 
r9, r9, #0x0\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADCS r11, r11, #0x0\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADCS r11, r11, #0x0\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[r], {r4}\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[r]!, {r4}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); + (void)m_p; } /* Double a Montgomery form number (r = a + a % m). @@ -33887,79 +54250,68 @@ SP_NOINLINE static void sp_521_mont_add_17(sp_digit* r, const sp_digit* a, const * a Number to double in Montgomery form. * m Modulus (prime). */ -SP_NOINLINE static void sp_521_mont_dbl_17(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_521_mont_dbl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "mov r8, #0\n\t" - "ldm %[a]!, {r2, r3, r4, r5, r6, r7}\n\t" - "adds r2, r2, r2\n\t" - "adcs r3, r3, r3\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "str r2, [%[r], #0]\n\t" - "str r3, [%[r], #4]\n\t" - "str r4, [%[r], #8]\n\t" - "str r5, [%[r], #12]\n\t" - "str r6, [%[r], #16]\n\t" - "stm %[r]!, {r2, r3, r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r2, r3, r4, r5, r6, r7}\n\t" - "adcs r2, r2, r2\n\t" - "adcs r3, r3, r3\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "str r2, [%[r], #0]\n\t" - "str r3, [%[r], #4]\n\t" - "str r4, [%[r], #8]\n\t" - "str r5, [%[r], #12]\n\t" - "str r6, [%[r], #16]\n\t" - "stm %[r]!, {r2, r3, r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r2, r3, r4, r5, r6}\n\t" - "adcs r2, r2, r2\n\t" - "adcs r3, r3, r3\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "mov r9, #0x1ff\n\t" - "lsr r8, r6, #9\n\t" - "and r6, r6, r9\n\t" - "stm %[r]!, {r2, r3, r4, r5, r6}\n\t" - "sub %[r], %[r], #68\n\t" - "mov r9, #0\n\t" - "ldm %[r], {r2, r3, r4, r5, r6, r7}\n\t" - "adds r2, r2, r8\n\t" - "adcs r3, r3, r9\n\t" - "adcs r4, r4, r9\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r9\n\t" - "adcs r7, r7, r9\n\t" - "stm %[r]!, {r2, r3, r4, r5, r6, r7}\n\t" - "ldm %[r], {r2, r3, r4, r5, r6, r7}\n\t" - "adcs r2, r2, r9\n\t" - "adcs r3, r3, r9\n\t" - "adcs r4, r4, r9\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r9\n\t" - "adcs r7, r7, r9\n\t" - "stm %[r]!, {r2, r3, r4, r5, r6, r7}\n\t" - "ldm %[r], {r2, r3, r4, r5, r6}\n\t" - "adcs r2, r2, r9\n\t" - "adcs r3, r3, r9\n\t" - "adcs r4, r4, r9\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r9\n\t" - "stm %[r]!, {r2, r3, r4, r5, r6}\n\t" - "sub %[r], %[r], #68\n\t" - "sub %[a], %[a], #68\n\t" + "MOV r2, #0x0\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, 
r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4}\n\t" + "ADCS r4, r4, r4\n\t" + "MOV r3, #0x1ff\n\t" + "LSR r2, r4, #9\n\t" + "AND r4, r4, r3\n\t" + "STM %[r]!, {r4}\n\t" + "SUB %[r], %[r], #0x44\n\t" + "LDM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r2\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADCS r11, r11, #0x0\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADCS r4, r4, #0x0\n\t" + "ADCS r5, r5, #0x0\n\t" + "ADCS r6, r6, #0x0\n\t" + "ADCS r7, r7, #0x0\n\t" + "ADCS r8, r8, #0x0\n\t" + "ADCS r9, r9, #0x0\n\t" + "ADCS r10, r10, #0x0\n\t" + "ADCS r11, r11, #0x0\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[r], {r4}\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[r]!, {r4}\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2", "r3" ); + (void)m_p; } /* Triple a Montgomery form number (r = a + a + a % m). @@ -33968,110 +54320,88 @@ SP_NOINLINE static void sp_521_mont_dbl_17(sp_digit* r, const sp_digit* a, const * a Number to triple in Montgomery form. * m Modulus (prime). */ -SP_NOINLINE static void sp_521_mont_tpl_17(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_521_mont_tpl_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "adds r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4}\n\t" - "adcs r4, r4, r4\n\t" - "stm %[r]!, {r4}\n\t" - "sub %[r], %[r], #68\n\t" - "sub %[a], %[a], #68\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "adds r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8}\n\t" - "ldm %[a]!, {r4}\n\t" - "adcs r8, r8, r4\n\t" - "mov r14, #0x1ff\n\t" - 
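sp_521_mont_add_17 and _dbl_17 above, and the _tpl_17 and _sub_17 rewrites that continue below, all rely on the same pseudo-Mersenne fold: since p521 = 2^521 - 1, the value 2^521 is congruent to 1, so anything that spills past bit 520 (the bits above the 0x1ff mask of word 16) is simply added back at bit 0; the subtraction applies the same identity with a signed fold, recovering the borrow with an arithmetic shift of the top word and subtracting it back out at the bottom. A minimal sketch of the additive case, with an assumed name and a plain carry chain:

#include <stdint.h>

/* Sketch of the reduction step shared by the add/double/triple helpers:
 * 2^521 == 1 (mod p521), so the overflow above bit 520 folds back to bit 0.
 * Illustrative only. */
static void add_mod_p521_sketch(uint32_t r[17], const uint32_t a[17],
                                const uint32_t b[17])
{
    uint64_t c = 0;
    int i;

    for (i = 0; i < 17; i++) {      /* r = a + b */
        c += (uint64_t)a[i] + b[i];
        r[i] = (uint32_t)c;
        c >>= 32;
    }

    c = r[16] >> 9;                 /* bits 521 and up, congruent to c */
    r[16] &= 0x1ff;
    for (i = 0; i < 17; i++) {      /* fold the overflow back in at bit 0 */
        c += r[i];
        r[i] = (uint32_t)c;
        c >>= 32;
    }
}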
"lsr r12, r8, #9\n\t" - "and r8, r8, r14\n\t" - "stm %[r]!, {r8}\n\t" - "sub %[r], %[r], #68\n\t" - "mov r14, #0\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "adds r4, r4, r12\n\t" - "adcs r5, r5, r14\n\t" - "adcs r6, r6, r14\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "adcs r4, r4, r14\n\t" - "adcs r5, r5, r14\n\t" - "adcs r6, r6, r14\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "adcs r4, r4, r14\n\t" - "adcs r5, r5, r14\n\t" - "adcs r6, r6, r14\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "adcs r4, r4, r14\n\t" - "adcs r5, r5, r14\n\t" - "adcs r6, r6, r14\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4}\n\t" - "adcs r4, r4, r14\n\t" - "stm %[r]!, {r4}\n\t" - "sub %[r], %[r], #68\n\t" + "MOV r2, #0x0\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4}\n\t" + "ADCS r4, r4, r4\n\t" + "STM %[r]!, {r4}\n\t" + "SUB %[r], %[r], #0x44\n\t" + "SUB %[a], %[a], #0x44\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4}\n\t" + "LDM %[a]!, {r8}\n\t" + "ADCS r4, r4, r8\n\t" + "MOV r3, #0x1ff\n\t" + "LSR r2, r4, #9\n\t" + "AND r4, r4, r3\n\t" + "STM %[r]!, {r4}\n\t" + "SUB %[r], %[r], #0x44\n\t" + "LDM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r2\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[r], {r4}\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[r]!, {r4}\n\t" : [r] "+r" (r), [a] "+r" (a) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r2", "r3" ); + (void)m_p; } /* Subtract two Montgomery form numbers (r = a - b % m). @@ -34081,199 +54411,155 @@ SP_NOINLINE static void sp_521_mont_tpl_17(sp_digit* r, const sp_digit* a, const * b Number to subtract with in Montgomery form. * m Modulus (prime). 
*/ -SP_NOINLINE static void sp_521_mont_sub_17(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +static void sp_521_mont_sub_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { - (void)m; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "subs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4}\n\t" - "ldm %[b]!, {r8}\n\t" - "sbcs r4, r4, r8\n\t" - "mov r14, #0x1ff\n\t" - "asr r12, r4, #9\n\t" - "and r4, r4, r14\n\t" - "neg r12, r12\n\t" - "stm %[r]!, {r4}\n\t" - "sub %[r], %[r], #68\n\t" - "mov r14, #0\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "subs r4, r4, r12\n\t" - "sbcs r5, r5, r14\n\t" - "sbcs r6, r6, r14\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "sbcs r4, r4, r14\n\t" - "sbcs r5, r5, r14\n\t" - "sbcs r6, r6, r14\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "sbcs r4, r4, r14\n\t" - "sbcs r5, r5, r14\n\t" - "sbcs r6, r6, r14\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "sbcs r4, r4, r14\n\t" - "sbcs r5, r5, r14\n\t" - "sbcs r6, r6, r14\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4}\n\t" - "sbcs r4, r4, r14\n\t" - "stm %[r]!, {r4}\n\t" - "sub %[r], %[r], #68\n\t" + "MOV r3, #0x0\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "LDM %[b]!, {r4, r5, r6, r7}\n\t" + "SUBS r8, r8, r4\n\t" + "SBCS r9, r9, r5\n\t" + "SBCS r10, r10, r6\n\t" + "SBCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "LDM %[b]!, {r4, r5, r6, r7}\n\t" + "SBCS r8, r8, r4\n\t" + "SBCS r9, r9, r5\n\t" + "SBCS r10, r10, r6\n\t" + "SBCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "LDM %[b]!, {r4, r5, r6, r7}\n\t" + "SBCS r8, r8, r4\n\t" + "SBCS r9, r9, r5\n\t" + "SBCS r10, r10, r6\n\t" + "SBCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r8, r9, r10, r11}\n\t" + "LDM %[b]!, {r4, r5, r6, r7}\n\t" + "SBCS r8, r8, r4\n\t" + "SBCS r9, r9, r5\n\t" + "SBCS r10, r10, r6\n\t" + "SBCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r8}\n\t" + "LDM %[b]!, {r4}\n\t" + "SBCS r8, r8, r4\n\t" + "MOV r12, #0x1ff\n\t" + "ASR r3, r8, #9\n\t" + "AND r8, r8, r12\n\t" + "neg r3, r3\n\t" + "STM %[r]!, {r8}\n\t" + "SUB %[r], %[r], #0x44\n\t" + "LDM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r4, r4, r3\n\t" + "SBCS r5, r5, #0x0\n\t" + "SBCS r6, r6, #0x0\n\t" + "SBCS r7, r7, 
#0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "SBCS r10, r10, #0x0\n\t" + "SBCS r11, r11, #0x0\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[r], {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "SBCS r4, r4, #0x0\n\t" + "SBCS r5, r5, #0x0\n\t" + "SBCS r6, r6, #0x0\n\t" + "SBCS r7, r7, #0x0\n\t" + "SBCS r8, r8, #0x0\n\t" + "SBCS r9, r9, #0x0\n\t" + "SBCS r10, r10, #0x0\n\t" + "SBCS r11, r11, #0x0\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[r], {r4}\n\t" + "SBCS r4, r4, #0x0\n\t" + "STM %[r]!, {r4}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); + (void)m_p; } -#define sp_521_mont_sub_lower_17 sp_521_mont_sub_17 -/* Conditionally add a and b using the mask m. - * m is -1 to add and 0 when not. - * - * r A single precision number representing conditional add result. - * a A single precision number to add with. - * b A single precision number to add. - * m Mask value to apply. - */ -SP_NOINLINE static sp_digit sp_521_cond_add_17(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) +static void sp_521_rshift1_17(sp_digit* r_p, const sp_digit* a_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "mov r5, #68\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adds r5, %[c], #-1\n\t" - "ldr r5, [%[a], r8]\n\t" - "adcs r5, r5, r6\n\t" - "mov %[c], #0\n\t" - "adcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" -#else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" - ); - - return c; -} - -static void sp_521_rshift1_17(sp_digit* r, const sp_digit* a) -{ - __asm__ __volatile__ ( - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" - "lsr r2, r2, #1\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r2, [%[r], #0]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r3, [%[r], #4]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r4, [%[r], #8]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r2, [%[r], #12]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r3, [%[r], #16]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r4, [%[r], #20]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r2, [%[r], #24]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r3, [%[r], #28]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r4, [%[r], #32]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r2, [%[r], #36]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r3, [%[r], #40]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r4, [%[r], 
#44]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r2, [%[r], #48]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r3, [%[r], #52]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #64]\n\t" - "str r4, [%[r], #56]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "str r2, [%[r], #60]\n\t" - "str r3, [%[r], #64]\n\t" + "LDM %[a], {r2, r3}\n\t" + "LSR r2, r2, #1\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #8]\n\t" + "STR r2, [%[r]]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #12]\n\t" + "STR r3, [%[r], #4]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #16]\n\t" + "STR r4, [%[r], #8]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #20]\n\t" + "STR r2, [%[r], #12]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #24]\n\t" + "STR r3, [%[r], #16]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #28]\n\t" + "STR r4, [%[r], #20]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #32]\n\t" + "STR r2, [%[r], #24]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #36]\n\t" + "STR r3, [%[r], #28]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #40]\n\t" + "STR r4, [%[r], #32]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #44]\n\t" + "STR r2, [%[r], #36]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #48]\n\t" + "STR r3, [%[r], #40]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #52]\n\t" + "STR r4, [%[r], #44]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #56]\n\t" + "STR r2, [%[r], #48]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #60]\n\t" + "STR r3, [%[r], #52]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #64]\n\t" + "STR r4, [%[r], #56]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "STR r2, [%[r], #60]\n\t" + "STR r3, [%[r], #64]\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) : "memory", "r2", "r3", "r4" ); } @@ -34284,13 +54570,14 @@ static void sp_521_rshift1_17(sp_digit* r, const sp_digit* a) * a Number to divide. * m Modulus (prime). */ -SP_NOINLINE static void sp_521_div2_17(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_521_div2_17(sp_digit* r, const sp_digit* a, const sp_digit* m) { - sp_digit o; + sp_digit o = a[0] & 1; + + (void)m; - o = sp_521_cond_add_17(r, a, m, 0 - (a[0] & 1)); sp_521_rshift1_17(r, r); - r[16] |= o << 31; + r[16] |= o << 8; } /* Double the Montgomery form projective point p. 
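The sp_521_div2_17 change above drops the conditional add of the modulus: for p521 = 2^521 - 1, halving an odd value a modulo p is (a + p)/2 = (a >> 1) + 2^520, so the bit shifted out at the bottom reappears as bit 520, which is bit 8 of word 16, hence the new r[16] |= o << 8. A portable sketch of the whole helper (name assumed, illustrative only):

#include <stdint.h>

/* Sketch of the new halving: r = a / 2 mod (2^521 - 1). */
static void div2_mod_p521_sketch(uint32_t r[17], const uint32_t a[17])
{
    uint32_t odd = a[0] & 1;
    int i;

    for (i = 0; i < 16; i++)                    /* r = a >> 1 */
        r[i] = (a[i] >> 1) | (a[i + 1] << 31);
    r[16] = a[16] >> 1;

    r[16] |= odd << 8;                          /* + 2^520 when a was odd */
}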
@@ -34347,7 +54634,7 @@ static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p, /* X = X - Y */ sp_521_mont_sub_17(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_lower_17(y, y, x, p521_mod); + sp_521_mont_sub_17(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -34469,7 +54756,7 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co break; case 16: /* Y = Y - X */ - sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -34535,12 +54822,12 @@ static int sp_521_iszero_17(const sp_digit* a) static void sp_521_proj_point_add_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*17; - sp_digit* t3 = t + 4*17; - sp_digit* t4 = t + 6*17; - sp_digit* t5 = t + 8*17; - sp_digit* t6 = t + 10*17; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*17; + sp_digit* t2 = t + 4*17; + sp_digit* t3 = t + 6*17; + sp_digit* t4 = t + 8*17; + sp_digit* t5 = t + 10*17; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod); @@ -34562,17 +54849,9 @@ static void sp_521_proj_point_add_17(sp_point_521* r, sp_521_proj_point_dbl_17(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_521_mont_sub_17(t2, t2, t1, p521_mod); @@ -34591,20 +54870,31 @@ static void sp_521_proj_point_add_17(sp_point_521* r, sp_521_mont_dbl_17(t3, y, p521_mod); sp_521_mont_sub_17(x, x, t3, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_17(y, y, x, p521_mod); + sp_521_mont_sub_17(y, y, x, p521_mod); sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_17(y, y, t5, p521_mod); - for (i = 0; i < 17; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 17; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 17; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -34650,12 +54940,12 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*17; - ctx->t3 = t + 4*17; - ctx->t4 = t + 6*17; - ctx->t5 = t + 8*17; - ctx->t6 = t + 10*17; + ctx->t6 = t; + ctx->t1 = t + 2*17; + ctx->t2 = t + 4*17; + ctx->t3 = t + 6*17; + ctx->t4 = t + 8*17; + ctx->t5 = t + 10*17; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -34762,7 +55052,7 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 21: /* Y3 = R*(U1*H^2 - 
X3) - S1*H^3 */ - sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_17(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 22; break; case 22: @@ -34775,22 +55065,28 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 17; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 17; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 17; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -35120,8 +55416,6 @@ static int sp_521_ecc_mulmod_fast_17(sp_point_521* r, const sp_point_521* g, con } #ifdef FP_ECC -#define sp_521_mont_dbl_lower_17 sp_521_mont_dbl_17 -#define sp_521_mont_tpl_lower_17 sp_521_mont_tpl_17 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
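The point-add rework above (both the straight-line and the non-blocking variants) gathers the infinity handling into a single block that selects each output word from P, Q or the freshly computed coordinate using masks, so no branch depends on whether either input was the point at infinity. A sketch of the per-coordinate selection, with an assumed helper name and 0/1 infinity flags:

#include <stdint.h>

/* Sketch of the mask-based selection (helper name and layout assumed);
 * p_inf and q_inf are 0 or 1. */
static void select_coord_sketch(uint32_t r[17], const uint32_t p[17],
    const uint32_t q[17], const uint32_t t[17],
    uint32_t p_inf, uint32_t q_inf)
{
    uint32_t maskp = 0U - (q_inf & (!p_inf)); /* only Q infinite: keep P   */
    uint32_t maskq = 0U - (p_inf & (!q_inf)); /* only P infinite: keep Q   */
    uint32_t maskt = ~(maskp | maskq);        /* otherwise: computed value */
    int i;

    for (i = 0; i < 17; i++)
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
}

When both inputs are infinite all three masks select the computed value, and the separate inf flag written afterwards (r->z[0] |= inf, r->infinity = inf) marks the result as the point at infinity, as the code above does.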
@@ -35160,7 +55454,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_17(a, t1, p521_mod); + sp_521_mont_tpl_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -35169,8 +55463,8 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_17(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_17(b, t2, p521_mod); + sp_521_mont_sub_17(t2, b, x, p521_mod); + sp_521_mont_dbl_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -35190,7 +55484,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_17(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_17(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_17(a, t1, p521_mod); + sp_521_mont_tpl_17(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_17(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_17(b, t1, x, p521_mod, p521_mp_mod); @@ -35199,8 +55493,8 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i, sp_521_mont_dbl_17(t2, b, p521_mod); sp_521_mont_sub_17(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_17(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_17(b, t2, p521_mod); + sp_521_mont_sub_17(t2, b, x, p521_mod); + sp_521_mont_dbl_17(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_17(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -35256,12 +55550,12 @@ typedef struct sp_table_entry_521 { static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*17; - sp_digit* t3 = t + 4*17; - sp_digit* t4 = t + 6*17; - sp_digit* t5 = t + 8*17; - sp_digit* t6 = t + 10*17; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*17; + sp_digit* t6 = t + 4*17; + sp_digit* t1 = t + 6*17; + sp_digit* t4 = t + 8*17; + sp_digit* t5 = t + 10*17; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -35277,13 +55571,9 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, sp_521_proj_point_dbl_17(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_521_mont_sub_17(t2, t2, p->x, p521_mod); @@ -35292,33 +55582,40 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, /* Z3 = H*Z1 */ sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_521_mont_sqr_17(t1, t4, p521_mod, p521_mp_mod); - sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t3, p->x, t5, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_17(x, t1, t5, p521_mod); - sp_521_mont_dbl_17(t1, t3, p521_mod); - sp_521_mont_sub_17(x, x, t1, p521_mod); + sp_521_mont_sqr_17(t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t3, p->x, t1, p521_mod, p521_mp_mod); + sp_521_mont_mul_17(t1, t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_17(t2, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_17(t2, t2, t1, p521_mod); + sp_521_mont_dbl_17(t5, t3, p521_mod); + sp_521_mont_sub_17(x, t2, t5, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_lower_17(t3, t3, x, p521_mod); + sp_521_mont_sub_17(t3, t3, x, p521_mod); sp_521_mont_mul_17(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_17(t5, t5, p->y, p521_mod, p521_mp_mod); - sp_521_mont_sub_17(y, t3, t5, p521_mod); + sp_521_mont_mul_17(t1, t1, p->y, p521_mod, p521_mp_mod); + sp_521_mont_sub_17(y, t3, t1, p521_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 17; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 17; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 17; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 17; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 17; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -36306,7 +56603,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_521* point = NULL; + sp_point_521* point = NULL; sp_digit* k = NULL; #else sp_point_521 point[2]; @@ -38410,7 +58707,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -38469,65 +58766,41 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, * * a A single precision integer. 
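The hunk that follows rewrites sp_521_add_one_17 from per-word ldr/adcs/str triples into LDM/ADCS/STM blocks; functionally it is just a carry-propagating increment of a 17-word integer. A plain C sketch (illustrative only, with a data-dependent early exit that the branch-free assembly deliberately avoids by touching every word):

#include <stdint.h>

/* What sp_521_add_one_17 computes, as plain C. */
static void add_one_17_sketch(uint32_t a[17])
{
    uint64_t c = 1;
    int i;

    for (i = 0; (i < 17) && (c != 0); i++) {
        c += a[i];
        a[i] = (uint32_t)c;
        c >>= 32;
    }
}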
*/ -SP_NOINLINE static void sp_521_add_one_17(sp_digit* a) +static void sp_521_add_one_17(sp_digit* a_p) { + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r2, #1\n\t" - "ldr r1, [%[a], #0]\n\t" - "adds r1, r1, r2\n\t" - "mov r2, #0\n\t" - "str r1, [%[a], #0]\n\t" - "ldr r1, [%[a], #4]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #4]\n\t" - "ldr r1, [%[a], #8]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #8]\n\t" - "ldr r1, [%[a], #12]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #12]\n\t" - "ldr r1, [%[a], #16]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #16]\n\t" - "ldr r1, [%[a], #20]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #20]\n\t" - "ldr r1, [%[a], #24]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #24]\n\t" - "ldr r1, [%[a], #28]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #28]\n\t" - "ldr r1, [%[a], #32]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #32]\n\t" - "ldr r1, [%[a], #36]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #36]\n\t" - "ldr r1, [%[a], #40]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #40]\n\t" - "ldr r1, [%[a], #44]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #44]\n\t" - "ldr r1, [%[a], #48]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #48]\n\t" - "ldr r1, [%[a], #52]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #52]\n\t" - "ldr r1, [%[a], #56]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #56]\n\t" - "ldr r1, [%[a], #60]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #60]\n\t" - "ldr r1, [%[a], #64]\n\t" - "adcs r1, r1, r2\n\t" - "str r1, [%[a], #64]\n\t" + "LDM %[a], {r1, r2, r3, r4}\n\t" + "ADDS r1, r1, #0x1\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4}\n\t" + "LDM %[a], {r1, r2, r3, r4}\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4}\n\t" + "LDM %[a], {r1, r2, r3, r4}\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4}\n\t" + "LDM %[a], {r1, r2, r3, r4}\n\t" + "ADCS r1, r1, #0x0\n\t" + "ADCS r2, r2, #0x0\n\t" + "ADCS r3, r3, #0x0\n\t" + "ADCS r4, r4, #0x0\n\t" + "STM %[a]!, {r1, r2, r3, r4}\n\t" + "LDM %[a], {r1}\n\t" + "ADCS r1, r1, #0x0\n\t" + "STM %[a]!, {r1}\n\t" + : [a] "+r" (a) : - : [a] "r" (a) - : "memory", "r1", "r2" + : "memory", "r1", "r2", "r3", "r4" ); } @@ -38623,7 +58896,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_521* infinity = NULL; #endif int err = MP_OKAY; - + (void)heap; @@ -38631,7 +58904,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -38899,427 +59172,436 @@ int sp_ecc_secret_gen_521_nb(sp_ecc_ctx_t* sp_ctx, const mp_int* priv, #endif /* HAVE_ECC_DHE */ #if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) -SP_NOINLINE static void sp_521_rshift_17(sp_digit* r, const sp_digit* a, byte n) +static void sp_521_rshift_17(sp_digit* r_p, const sp_digit* a_p, byte n_p) { - __asm__ __volatile__ ( - "mov r6, #32\n\t" - "sub r6, r6, %[n]\n\t" - "ldrd r2, r3, [%[a]]\n\t" - "lsr r2, r2, %[n]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, 
r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r2, [%[r], #0]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r3, [%[r], #4]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r4, [%[r], #8]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r2, [%[r], #12]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r3, [%[r], #16]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r4, [%[r], #20]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r2, [%[r], #24]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r3, [%[r], #28]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r4, [%[r], #32]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r2, [%[r], #36]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r3, [%[r], #40]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r4, [%[r], #44]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r2, [%[r], #48]\n\t" - "lsl r5, r4, r6\n\t" - "lsr r4, r4, %[n]\n\t" - "orr r3, r3, r5\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r3, [%[r], #52]\n\t" - "lsl r5, r2, r6\n\t" - "lsr r2, r2, %[n]\n\t" - "orr r4, r4, r5\n\t" - "ldr r3, [%[a], #64]\n\t" - "str r4, [%[r], #56]\n\t" - "lsl r5, r3, r6\n\t" - "lsr r3, r3, %[n]\n\t" - "orr r2, r2, r5\n\t" - "strd r2, r3, [%[r], #60]\n\t" - : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) - : "memory", "r2", "r3", "r4", "r5", "r6" - ); -} + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; -#endif -#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) -#endif -#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) -static void sp_521_lshift_17(sp_digit* r, const sp_digit* a, byte n) -{ __asm__ __volatile__ ( - "mov r6, #31\n\t" - "sub r6, r6, %[n]\n\t" - "ldr r3, [%[a], #64]\n\t" - "lsr r4, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r4, r4, r6\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r4, [%[r], #68]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r3, [%[r], #64]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r2, [%[r], #60]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r4, [%[r], #56]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r3, [%[r], #52]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r2, [%[r], #48]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r4, [%[r], #44]\n\t" - 
"lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r3, [%[r], #40]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r2, [%[r], #36]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r4, [%[r], #32]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r3, [%[r], #28]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r2, [%[r], #24]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r4, [%[r], #20]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r3, [%[r], #16]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #4]\n\t" - "str r2, [%[r], #12]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #0]\n\t" - "str r4, [%[r], #8]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "str r2, [%[r]]\n\t" - "str r3, [%[r], #4]\n\t" + "RSB r7, %[n], #0x20\n\t" + "LDRD r4, r5, [%[a]]\n\t" + "LSR r4, r4, %[n]\n\t" + "LSL r3, r5, r7\n\t" + "LSR r5, r5, %[n]\n\t" + "ORR r4, r4, r3\n\t" + "LDR r6, [%[a], #8]\n\t" + "STR r4, [%[a]]\n\t" + "LSL r3, r6, r7\n\t" + "LSR r6, r6, %[n]\n\t" + "ORR r5, r5, r3\n\t" + "LDR r4, [%[a], #12]\n\t" + "STR r5, [%[a], #4]\n\t" + "LSL r3, r4, r7\n\t" + "LSR r4, r4, %[n]\n\t" + "ORR r6, r6, r3\n\t" + "LDR r5, [%[a], #16]\n\t" + "STR r6, [%[a], #8]\n\t" + "LSL r3, r5, r7\n\t" + "LSR r5, r5, %[n]\n\t" + "ORR r4, r4, r3\n\t" + "LDR r6, [%[a], #20]\n\t" + "STR r4, [%[a], #12]\n\t" + "LSL r3, r6, r7\n\t" + "LSR r6, r6, %[n]\n\t" + "ORR r5, r5, r3\n\t" + "LDR r4, [%[a], #24]\n\t" + "STR r5, [%[a], #16]\n\t" + "LSL r3, r4, r7\n\t" + "LSR r4, r4, %[n]\n\t" + "ORR r6, r6, r3\n\t" + "LDR r5, [%[a], #28]\n\t" + "STR r6, [%[a], #20]\n\t" + "LSL r3, r5, r7\n\t" + "LSR r5, r5, %[n]\n\t" + "ORR r4, r4, r3\n\t" + "LDR r6, [%[a], #32]\n\t" + "STR r4, [%[a], #24]\n\t" + "LSL r3, r6, r7\n\t" + "LSR r6, r6, %[n]\n\t" + "ORR r5, r5, r3\n\t" + "LDR r4, [%[a], #36]\n\t" + "STR r5, [%[a], #28]\n\t" + "LSL r3, r4, r7\n\t" + "LSR r4, r4, %[n]\n\t" + "ORR r6, r6, r3\n\t" + "LDR r5, [%[a], #40]\n\t" + "STR r6, [%[a], #32]\n\t" + "LSL r3, r5, r7\n\t" + "LSR r5, r5, %[n]\n\t" + "ORR r4, r4, r3\n\t" + "LDR r6, [%[a], #44]\n\t" + "STR r4, [%[a], #36]\n\t" + "LSL r3, r6, r7\n\t" + "LSR r6, r6, %[n]\n\t" + "ORR r5, r5, r3\n\t" + "LDR r4, [%[a], #48]\n\t" + "STR r5, [%[a], #40]\n\t" + "LSL r3, r4, r7\n\t" + "LSR r4, r4, %[n]\n\t" + "ORR r6, r6, r3\n\t" + "LDR r5, [%[a], #52]\n\t" + "STR r6, [%[a], #44]\n\t" + "LSL r3, r5, r7\n\t" + "LSR r5, r5, %[n]\n\t" + "ORR r4, r4, r3\n\t" + "LDR r6, [%[a], #56]\n\t" + "STR r4, [%[a], #48]\n\t" + "LSL r3, r6, r7\n\t" + "LSR r6, r6, %[n]\n\t" + "ORR r5, r5, r3\n\t" + "LDR r4, [%[a], #60]\n\t" + "STR r5, [%[a], #52]\n\t" + "LSL r3, r4, r7\n\t" + "LSR r4, r4, %[n]\n\t" + "ORR r6, r6, r3\n\t" + "LDR r5, [%[a], #64]\n\t" + "STR r6, [%[a], #56]\n\t" + "LSL r3, r5, r7\n\t" + "LSR r5, r5, %[n]\n\t" + "ORR r4, r4, r3\n\t" + "STRD r4, r5, 
[%[r], #60]\n\t" + : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n) : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) - : "memory", "r2", "r3", "r4", "r5", "r6" + : "memory", "r4", "r5", "r6", "r3", "r7" ); } -static void sp_521_lshift_34(sp_digit* r, const sp_digit* a, byte n) +#endif +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +#endif +#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY) +static void sp_521_lshift_17(sp_digit* r_p, const sp_digit* a_p, byte n_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; + __asm__ __volatile__ ( - "mov r6, #31\n\t" - "sub r6, r6, %[n]\n\t" - "ldr r3, [%[a], #132]\n\t" - "lsr r4, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r4, r4, r6\n\t" - "ldr r2, [%[a], #128]\n\t" - "str r4, [%[r], #136]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #124]\n\t" - "str r3, [%[r], #132]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #120]\n\t" - "str r2, [%[r], #128]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #116]\n\t" - "str r4, [%[r], #124]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #112]\n\t" - "str r3, [%[r], #120]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #108]\n\t" - "str r2, [%[r], #116]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #104]\n\t" - "str r4, [%[r], #112]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #100]\n\t" - "str r3, [%[r], #108]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #96]\n\t" - "str r2, [%[r], #104]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #92]\n\t" - "str r4, [%[r], #100]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #88]\n\t" - "str r3, [%[r], #96]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #84]\n\t" - "str r2, [%[r], #92]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #80]\n\t" - "str r4, [%[r], #88]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #76]\n\t" - "str r3, [%[r], #84]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #72]\n\t" - "str r2, [%[r], #80]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #68]\n\t" - "str r4, [%[r], #76]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #64]\n\t" - "str r3, [%[r], #72]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #60]\n\t" - "str r2, [%[r], #68]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #56]\n\t" - "str 
r4, [%[r], #64]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #52]\n\t" - "str r3, [%[r], #60]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #48]\n\t" - "str r2, [%[r], #56]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #44]\n\t" - "str r4, [%[r], #52]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #40]\n\t" - "str r3, [%[r], #48]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #36]\n\t" - "str r2, [%[r], #44]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #32]\n\t" - "str r4, [%[r], #40]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #28]\n\t" - "str r3, [%[r], #36]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #24]\n\t" - "str r2, [%[r], #32]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #20]\n\t" - "str r4, [%[r], #28]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #16]\n\t" - "str r3, [%[r], #24]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #12]\n\t" - "str r2, [%[r], #20]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "ldr r2, [%[a], #8]\n\t" - "str r4, [%[r], #16]\n\t" - "lsr r5, r2, #1\n\t" - "lsl r2, r2, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r3, r3, r5\n\t" - "ldr r4, [%[a], #4]\n\t" - "str r3, [%[r], #12]\n\t" - "lsr r5, r4, #1\n\t" - "lsl r4, r4, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r2, r2, r5\n\t" - "ldr r3, [%[a], #0]\n\t" - "str r2, [%[r], #8]\n\t" - "lsr r5, r3, #1\n\t" - "lsl r3, r3, %[n]\n\t" - "lsr r5, r5, r6\n\t" - "orr r4, r4, r5\n\t" - "str r3, [%[r]]\n\t" - "str r4, [%[r], #4]\n\t" + "RSB r7, %[n], #0x1f\n\t" + "LDR r5, [%[a], #64]\n\t" + "LSR r6, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r6, r6, r7\n\t" + "LDR r4, [%[a], #60]\n\t" + "STR r6, [%[r], #68]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #56]\n\t" + "STR r5, [%[r], #64]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #52]\n\t" + "STR r4, [%[r], #60]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #48]\n\t" + "STR r6, [%[r], #56]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #44]\n\t" + "STR r5, [%[r], #52]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #40]\n\t" + "STR r4, [%[r], #48]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #36]\n\t" + "STR r6, [%[r], #44]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #32]\n\t" + "STR r5, [%[r], #40]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, 
r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #28]\n\t" + "STR r4, [%[r], #36]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #24]\n\t" + "STR r6, [%[r], #32]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #20]\n\t" + "STR r5, [%[r], #28]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #16]\n\t" + "STR r4, [%[r], #24]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #12]\n\t" + "STR r6, [%[r], #20]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #8]\n\t" + "STR r5, [%[r], #16]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #4]\n\t" + "STR r4, [%[r], #12]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a]]\n\t" + "STR r6, [%[r], #8]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "STR r4, [%[r]]\n\t" + "STR r5, [%[r], #4]\n\t" + : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n) : - : [r] "r" (r), [a] "r" (a), [n] "r" (n) - : "memory", "r2", "r3", "r4", "r5", "r6" + : "memory", "r4", "r5", "r6", "r3", "r7" + ); +} + +static void sp_521_lshift_34(sp_digit* r_p, const sp_digit* a_p, byte n_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register byte n asm ("r2") = (byte)n_p; + + __asm__ __volatile__ ( + "RSB r7, %[n], #0x1f\n\t" + "LDR r5, [%[a], #132]\n\t" + "LSR r6, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r6, r6, r7\n\t" + "LDR r4, [%[a], #128]\n\t" + "STR r6, [%[r], #136]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #124]\n\t" + "STR r5, [%[r], #132]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #120]\n\t" + "STR r4, [%[r], #128]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #116]\n\t" + "STR r6, [%[r], #124]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #112]\n\t" + "STR r5, [%[r], #120]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #108]\n\t" + "STR r4, [%[r], #116]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #104]\n\t" + "STR r6, [%[r], #112]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #100]\n\t" + "STR r5, [%[r], #108]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #96]\n\t" + "STR r4, [%[r], #104]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #92]\n\t" + "STR r6, [%[r], #100]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #88]\n\t" + "STR r5, [%[r], #96]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #84]\n\t" + 
"STR r4, [%[r], #92]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #80]\n\t" + "STR r6, [%[r], #88]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #76]\n\t" + "STR r5, [%[r], #84]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #72]\n\t" + "STR r4, [%[r], #80]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #68]\n\t" + "STR r6, [%[r], #76]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #64]\n\t" + "STR r5, [%[r], #72]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #60]\n\t" + "STR r4, [%[r], #68]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #56]\n\t" + "STR r6, [%[r], #64]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #52]\n\t" + "STR r5, [%[r], #60]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #48]\n\t" + "STR r4, [%[r], #56]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #44]\n\t" + "STR r6, [%[r], #52]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #40]\n\t" + "STR r5, [%[r], #48]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #36]\n\t" + "STR r4, [%[r], #44]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #32]\n\t" + "STR r6, [%[r], #40]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #28]\n\t" + "STR r5, [%[r], #36]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #24]\n\t" + "STR r4, [%[r], #32]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #20]\n\t" + "STR r6, [%[r], #28]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #16]\n\t" + "STR r5, [%[r], #24]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a], #12]\n\t" + "STR r4, [%[r], #20]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "LDR r4, [%[a], #8]\n\t" + "STR r6, [%[r], #16]\n\t" + "LSR r3, r4, #1\n\t" + "LSL r4, r4, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r5, r5, r3\n\t" + "LDR r6, [%[a], #4]\n\t" + "STR r5, [%[r], #12]\n\t" + "LSR r3, r6, #1\n\t" + "LSL r6, r6, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r4, r4, r3\n\t" + "LDR r5, [%[a]]\n\t" + "STR r4, [%[r], #8]\n\t" + "LSR r3, r5, #1\n\t" + "LSL r5, r5, %[n]\n\t" + "LSR r3, r3, r7\n\t" + "ORR r6, r6, r3\n\t" + "STR r5, [%[r]]\n\t" + "STR r6, [%[r], #4]\n\t" + : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n) + : + : "memory", "r4", "r5", "r6", "r3", "r7" ); } @@ -39329,150 +59611,251 @@ static void sp_521_lshift_34(sp_digit* r, const sp_digit* a, byte n) * a A single precision integer. 
* b A single precision integer. */ -SP_NOINLINE static sp_digit sp_521_sub_in_place_17(sp_digit* a, - const sp_digit* b) +static sp_digit sp_521_sub_in_place_17(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r8, %[a]\n\t" - "add r8, r8, #64\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #8\n\t" - "add %[b], %[b], #8\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" -#else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r8" - ); + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; - return c; + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "ADD r11, %[a], #0x40\n\t" + "\n" + "L_sp_521_sub_in_pkace_17_word_%=:\n\t" + "RSBS r10, r10, #0x0\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC r10, r10, r10\n\t" + "CMP %[a], r11\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BNE L_sp_521_sub_in_pkace_17_word_%=\n\t" +#else + "BNE.N L_sp_521_sub_in_pkace_17_word_%=\n\t" +#endif + "RSBS r10, r10, #0x0\n\t" + "LDM %[a], {r2}\n\t" + "LDM %[b]!, {r6}\n\t" + "SBCS r2, r2, r6\n\t" + "STM %[a]!, {r2}\n\t" + "SBC %[a], %[a], %[a]\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)a; } #else -/* Sub b from a into r. (r = a - b) +/* Sub b from a into a. (a -= b) * - * r A single precision integer. - * a A single precision integer. + * a A single precision integer and result. * b A single precision integer. 
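sp_521_sub_in_place_17 computes a -= b across all 17 words with borrow propagation and returns 0 when there is no final borrow and an all-ones word when there is, via the SBC reg, reg, reg idiom. A portable model of that borrow chain, using a hypothetical sub_in_place_words helper (editor's sketch, not part of the patch):

    #include <stdint.h>

    /* a -= b over s 32-bit words; returns 0 on no borrow, 0xffffffff on
     * borrow, matching the SBC convention in the assembly. Illustrative only. */
    static uint32_t sub_in_place_words(uint32_t* a, const uint32_t* b, int s)
    {
        uint64_t borrow = 0;
        int i;
        for (i = 0; i < s; i++) {
            uint64_t d = (uint64_t)a[i] - b[i] - borrow;
            a[i] = (uint32_t)d;
            borrow = d >> 63;                 /* 1 if the subtraction wrapped */
        }
        return (uint32_t)(0 - borrow);
    }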
*/ -SP_NOINLINE static sp_digit sp_521_sub_in_place_17(sp_digit* a, - const sp_digit* b) +static sp_digit sp_521_sub_in_place_17(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "sbcs r3, r3, r5\n\t" - "str r3, [%[a]]\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2}\n\t" + "LDM %[b]!, {r6}\n\t" + "SBCS r2, r2, r6\n\t" + "STM %[a]!, {r2}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); - - return c; + return (uint32_t)(size_t)a; } #endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL /* Mul a by digit b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision digit. 
*/ -SP_NOINLINE static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, - sp_digit b) +static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + __asm__ __volatile__ ( - "add r9, %[a], #68\n\t" /* A[0] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r5, r3, r6, %[b]\n\t" - "mov r4, #0\n\t" - "str r5, [%[r]], #4\n\t" - /* A[0] * B - Done */ - "\n1:\n\t" - "mov r5, #0\n\t" - /* A[] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r6, r8, r6, %[b]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[] * B - Done */ - "str r3, [%[r]], #4\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "cmp %[a], r9\n\t" + "LDR r8, [%[a]]\n\t" + "UMULL r5, r3, %[b], r8\n\t" + "MOV r4, #0x0\n\t" + "STR r5, [%[r]]\n\t" + "MOV r5, #0x0\n\t" + "MOV r9, #0x4\n\t" + "\n" + "L_sp_521_mul_d_17_word_%=:\n\t" + /* A[i] * B */ + "LDR r8, [%[a], r9]\n\t" + "UMULL r6, r7, %[b], r8\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], r9]\n\t" + "MOV r3, r4\n\t" + "MOV r4, r5\n\t" + "MOV r5, #0x0\n\t" + "ADD r9, r9, #0x4\n\t" + "CMP r9, #0x44\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_521_mul_d_17_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r]]\n\t" - : [r] "+r" (r), [a] "+r" (a) - : [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9" + "BLT.N L_sp_521_mul_d_17_word_%=\n\t" +#endif + "STR r3, [%[r], #68]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } +#else +/* Mul a by digit b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision digit. 
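sp_521_mul_d_17 multiplies a 17-word number by a single 32-bit digit, carrying the high half of each 32x32->64 product into the next word (UMULL starts the chain; the unrolled variant that follows accumulates with UMLAL). The equivalent portable loop, as an editor's sketch with a hypothetical mul_d_words helper:

    #include <stdint.h>

    /* r = a * b for an s-word a and 32-bit digit b; writes s + 1 words.
     * The 64-bit running value t plays the role of the UMLAL accumulator. */
    static void mul_d_words(uint32_t* r, const uint32_t* a, uint32_t b, int s)
    {
        uint64_t t = 0;
        int i;
        for (i = 0; i < s; i++) {
            t += (uint64_t)a[i] * b;   /* add the 32x32->64 product to the carry */
            r[i] = (uint32_t)t;
            t >>= 32;                  /* high half becomes the next carry */
        }
        r[s] = (uint32_t)t;
    }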
+ */ +static void sp_521_mul_d_17(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + + __asm__ __volatile__ ( + /* A[0] * B */ + "LDM %[a]!, {r8}\n\t" + "UMULL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[1] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[2] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[3] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[4] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[5] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[6] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[7] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[8] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[9] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[10] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[11] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[12] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[13] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[14] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[15] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[16] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "STR r5, [%[r]]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_USE_UDIV /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -39482,49 +59865,122 @@ SP_NOINLINE static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, * * Note that this is an approximate div. It may give an answer 1 larger. 
*/ -SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, - sp_digit div) +static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - sp_digit r = 0; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( - "lsr r6, %[div], #16\n\t" - "add r6, r6, #1\n\t" - "udiv r4, %[d1], r6\n\t" - "lsl r8, r4, #16\n\t" - "umull r4, r5, %[div], r8\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r5, %[d1], r6\n\t" - "lsl r4, r5, #16\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r4, %[d0], %[div]\n\t" - "add r8, r8, r4\n\t" - "mov %[r], r8\n\t" - : [r] "+r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "r4", "r5", "r6", "r8" + "LSR r8, %[div], #16\n\t" + "ADD r5, r8, #0x1\n\t" + "UDIV r6, %[d1], r5\n\t" + "LSL r7, %[div], #16\n\t" + "LSL r6, r6, #16\n\t" + "UMULL r3, r4, %[div], r6\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "SUBS r3, %[d1], r5\n\t" + "SBC r9, r9, r9\n\t" + "ADD r9, r9, #0x1\n\t" + "RSB r10, r9, #0x0\n\t" + "LSL r9, r9, #16\n\t" + "AND r7, r7, r10\n\t" + "AND r8, r8, r10\n\t" + "SUBS %[d0], %[d0], r7\n\t" + "ADD r6, r6, r9\n\t" + "SBC %[d1], %[d1], r8\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "UMULL r3, r4, %[div], r3\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "MUL r3, %[div], r3\n\t" + "SUB %[d0], %[d0], r3\n\t" + "UDIV r3, %[d0], %[div]\n\t" + "ADD %[d1], r6, r3\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - return r; + return (uint32_t)(size_t)d1; } +#else +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. + * div The divisor. + * returns the result of the division. + * + * Note that this is an approximate div. It may give an answer 1 larger. 
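div_521_word_17 estimates the quotient of the 64-bit value (d1:d0) divided by a 32-bit divisor; as the comment notes, the result may be one too large. The UDIV-based variant above assembles the quotient from 16-bit partial quotients, while the variant that follows derives it bit by bit. With native 64-bit arithmetic the exact operation is simply the following (editor's sketch with a hypothetical div_word helper, assuming d1 < div so the quotient fits in 32 bits):

    #include <stdint.h>

    /* Exact 64/32 -> 32 division for reference; the assembly avoids a 64-bit
     * divide and accepts an estimate that may be one too high instead. */
    static uint32_t div_word(uint32_t d1, uint32_t d0, uint32_t div)
    {
        uint64_t d = ((uint64_t)d1 << 32) | d0;
        return (uint32_t)(d / div);
    }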
+ */ +static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +{ + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; + + __asm__ __volatile__ ( + "LSR r5, %[div], #1\n\t" + "ADD r5, r5, #0x1\n\t" + "MOV r6, %[d0]\n\t" + "MOV r7, %[d1]\n\t" + /* Do top 32 */ + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "MOV r3, #0x0\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + /* Next 30 bits */ + "MOV r4, #0x1d\n\t" + "\n" + "L_div_521_word_17_bit_%=:\n\t" + "LSLS r6, r6, #1\n\t" + "ADC r7, r7, r7\n\t" + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "ADD r3, r3, r3\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + "SUBS r4, r4, #0x1\n\t" + "bpl L_div_521_word_17_bit_%=\n\t" + "ADD r3, r3, r3\n\t" + "ADD r3, r3, #0x1\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "SUBS r8, %[div], r9\n\t" + "SBC r8, r8, r8\n\t" + "SUB %[d1], r3, r8\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)d1; +} + +#endif /* AND m into each word of a and store in r. * * r A single precision integer. @@ -40177,333 +60633,368 @@ int sp_ecc_sign_521_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, W * a Number to divide. * m Modulus. */ -static void sp_521_div2_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_521_div2_mod_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; + __asm__ __volatile__ ( - "ldr r4, [%[a]]\n\t" - "ands r8, r4, #1\n\t" - "beq 1f\n\t" - "mov r12, #0\n\t" - "ldr r5, [%[a], #4]\n\t" - "ldr r6, [%[a], #8]\n\t" - "ldr r7, [%[a], #12]\n\t" - "ldr r8, [%[m], #0]\n\t" - "ldr r9, [%[m], #4]\n\t" - "ldr r10, [%[m], #8]\n\t" - "ldr r14, [%[m], #12]\n\t" - "adds r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #0]\n\t" - "str r5, [%[r], #4]\n\t" - "str r6, [%[r], #8]\n\t" - "str r7, [%[r], #12]\n\t" - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[a], #20]\n\t" - "ldr r6, [%[a], #24]\n\t" - "ldr r7, [%[a], #28]\n\t" - "ldr r8, [%[m], #16]\n\t" - "ldr r9, [%[m], #20]\n\t" - "ldr r10, [%[m], #24]\n\t" - "ldr r14, [%[m], #28]\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #16]\n\t" - "str r5, [%[r], #20]\n\t" - "str r6, [%[r], #24]\n\t" - "str r7, [%[r], #28]\n\t" - "ldr r4, [%[a], #32]\n\t" - "ldr r5, [%[a], #36]\n\t" - "ldr r6, [%[a], #40]\n\t" - "ldr r7, [%[a], #44]\n\t" - "ldr r8, [%[m], #32]\n\t" - "ldr r9, [%[m], #36]\n\t" - "ldr r10, [%[m], #40]\n\t" - "ldr r14, [%[m], #44]\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #32]\n\t" - "str r5, [%[r], #36]\n\t" - "str r6, [%[r], #40]\n\t" - "str r7, [%[r], #44]\n\t" - "ldr r4, [%[a], #48]\n\t" - "ldr r5, [%[a], #52]\n\t" - "ldr r6, [%[a], #56]\n\t" - "ldr r7, [%[a], #60]\n\t" 
- "ldr r8, [%[m], #48]\n\t" - "ldr r9, [%[m], #52]\n\t" - "ldr r10, [%[m], #56]\n\t" - "ldr r14, [%[m], #60]\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "str r4, [%[r], #48]\n\t" - "str r5, [%[r], #52]\n\t" - "str r6, [%[r], #56]\n\t" - "str r7, [%[r], #60]\n\t" - "ldr r4, [%[a], #64]\n\t" - "ldr r8, [%[m], #64]\n\t" - "adcs r4, r4, r8\n\t" - "str r4, [%[r], #64]\n\t" - "adc r8, r12, r12\n\t" - "b 2f\n\t" - "\n1:\n\t" - "ldr r5, [%[a], #2]\n\t" - "str r4, [%[r], #0]\n\t" - "str r5, [%[r], #2]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[a], #6]\n\t" - "str r4, [%[r], #4]\n\t" - "str r5, [%[r], #6]\n\t" - "ldr r4, [%[a], #8]\n\t" - "ldr r5, [%[a], #10]\n\t" - "str r4, [%[r], #8]\n\t" - "str r5, [%[r], #10]\n\t" - "ldr r4, [%[a], #12]\n\t" - "ldr r5, [%[a], #14]\n\t" - "str r4, [%[r], #12]\n\t" - "str r5, [%[r], #14]\n\t" - "ldr r4, [%[a], #16]\n\t" - "ldr r5, [%[a], #18]\n\t" - "str r4, [%[r], #16]\n\t" - "str r5, [%[r], #18]\n\t" - "ldr r4, [%[a], #20]\n\t" - "ldr r5, [%[a], #22]\n\t" - "str r4, [%[r], #20]\n\t" - "str r5, [%[r], #22]\n\t" - "ldr r4, [%[a], #24]\n\t" - "ldr r5, [%[a], #26]\n\t" - "str r4, [%[r], #24]\n\t" - "str r5, [%[r], #26]\n\t" - "ldr r4, [%[a], #28]\n\t" - "ldr r5, [%[a], #30]\n\t" - "str r4, [%[r], #28]\n\t" - "str r5, [%[r], #30]\n\t" - "\n2:\n\t" - "ldr r3, [%[r]]\n\t" - "ldr r4, [%[r], #4]\n\t" - "lsr r3, r3, #1\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r5, [%[a], #8]\n\t" - "str r3, [%[r], #0]\n\t" - "orr r4, r4, r5, lsl #31\n\t" - "lsr r5, r5, #1\n\t" - "ldr r3, [%[a], #12]\n\t" - "str r4, [%[r], #4]\n\t" - "orr r5, r5, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #16]\n\t" - "str r5, [%[r], #8]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r5, [%[a], #20]\n\t" - "str r3, [%[r], #12]\n\t" - "orr r4, r4, r5, lsl #31\n\t" - "lsr r5, r5, #1\n\t" - "ldr r3, [%[a], #24]\n\t" - "str r4, [%[r], #16]\n\t" - "orr r5, r5, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #28]\n\t" - "str r5, [%[r], #20]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r5, [%[a], #32]\n\t" - "str r3, [%[r], #24]\n\t" - "orr r4, r4, r5, lsl #31\n\t" - "lsr r5, r5, #1\n\t" - "ldr r3, [%[a], #36]\n\t" - "str r4, [%[r], #28]\n\t" - "orr r5, r5, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #40]\n\t" - "str r5, [%[r], #32]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r5, [%[a], #44]\n\t" - "str r3, [%[r], #36]\n\t" - "orr r4, r4, r5, lsl #31\n\t" - "lsr r5, r5, #1\n\t" - "ldr r3, [%[a], #48]\n\t" - "str r4, [%[r], #40]\n\t" - "orr r5, r5, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #52]\n\t" - "str r5, [%[r], #44]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r5, [%[a], #56]\n\t" - "str r3, [%[r], #48]\n\t" - "orr r4, r4, r5, lsl #31\n\t" - "lsr r5, r5, #1\n\t" - "ldr r3, [%[a], #60]\n\t" - "str r4, [%[r], #52]\n\t" - "orr r5, r5, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #64]\n\t" - "str r5, [%[r], #56]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "orr r4, r4, r8, lsl #31\n\t" - "str r3, [%[r], #60]\n\t" - "str r4, [%[r], #64]\n\t" + "LDM %[a]!, {r4}\n\t" + "ANDS r3, r4, #0x1\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_div2_mod_17_even_%=\n\t" +#else + "BEQ.N L_sp_521_div2_mod_17_even_%=\n\t" +#endif + "MOV r12, #0x0\n\t" + "LDM %[a]!, {r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + 
"ADDS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4}\n\t" + "LDM %[m]!, {r8}\n\t" + "ADCS r4, r4, r8\n\t" + "STM %[r]!, {r4}\n\t" + "ADC r3, r12, r12\n\t" + "B L_sp_521_div2_mod_17_div2_%=\n\t" + "\n" + "L_sp_521_div2_mod_17_even_%=:\n\t" + "LDM %[a]!, {r5, r6, r7}\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4}\n\t" + "STM %[r]!, {r4}\n\t" + "\n" + "L_sp_521_div2_mod_17_div2_%=:\n\t" + "SUB %[r], %[r], #0x44\n\t" + "LDRD r8, r9, [%[r]]\n\t" + "LSR r8, r8, #1\n\t" + "ORR r8, r8, r9, lsl #31\n\t" + "LSR r9, r9, #1\n\t" + "LDR r10, [%[r], #8]\n\t" + "STR r8, [%[r]]\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "LSR r10, r10, #1\n\t" + "LDR r8, [%[r], #12]\n\t" + "STR r9, [%[r], #4]\n\t" + "ORR r10, r10, r8, lsl #31\n\t" + "LSR r8, r8, #1\n\t" + "LDR r9, [%[r], #16]\n\t" + "STR r10, [%[r], #8]\n\t" + "ORR r8, r8, r9, lsl #31\n\t" + "LSR r9, r9, #1\n\t" + "LDR r10, [%[r], #20]\n\t" + "STR r8, [%[r], #12]\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "LSR r10, r10, #1\n\t" + "LDR r8, [%[r], #24]\n\t" + "STR r9, [%[r], #16]\n\t" + "ORR r10, r10, r8, lsl #31\n\t" + "LSR r8, r8, #1\n\t" + "LDR r9, [%[r], #28]\n\t" + "STR r10, [%[r], #20]\n\t" + "ORR r8, r8, r9, lsl #31\n\t" + "LSR r9, r9, #1\n\t" + "LDR r10, [%[r], #32]\n\t" + "STR r8, [%[r], #24]\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "LSR r10, r10, #1\n\t" + "LDR r8, [%[r], #36]\n\t" + "STR r9, [%[r], #28]\n\t" + "ORR r10, r10, r8, lsl #31\n\t" + "LSR r8, r8, #1\n\t" + "LDR r9, [%[r], #40]\n\t" + "STR r10, [%[r], #32]\n\t" + "ORR r8, r8, r9, lsl #31\n\t" + "LSR r9, r9, #1\n\t" + "LDR r10, [%[r], #44]\n\t" + "STR r8, [%[r], #36]\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "LSR r10, r10, #1\n\t" + "LDR r8, [%[r], #48]\n\t" + "STR r9, [%[r], #40]\n\t" + "ORR r10, r10, r8, lsl #31\n\t" + "LSR r8, r8, #1\n\t" + "LDR r9, [%[r], #52]\n\t" + "STR r10, [%[r], #44]\n\t" + "ORR r8, r8, r9, lsl #31\n\t" + "LSR r9, r9, #1\n\t" + "LDR r10, [%[r], #56]\n\t" + "STR r8, [%[r], #48]\n\t" + "ORR r9, r9, r10, lsl #31\n\t" + "LSR r10, r10, #1\n\t" + "LDR r8, [%[r], #60]\n\t" + "STR r9, [%[r], #52]\n\t" + "ORR r10, r10, r8, lsl #31\n\t" + "LSR r8, r8, #1\n\t" + "LDR r9, [%[r], #64]\n\t" + "STR r10, [%[r], #56]\n\t" + "ORR r8, r8, r9, lsl #31\n\t" + "LSR r9, r9, #1\n\t" + "ORR r9, r9, r3, lsl #31\n\t" + "STR r8, [%[r], #60]\n\t" + "STR r9, [%[r], #64]\n\t" + : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m) : - : [r] "r" (r), [a] "r" (a), [m] "r" (m) - : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); } -static int sp_521_num_bits_17(sp_digit* a) +static int 
sp_521_num_bits_17(const sp_digit* a_p) { - int r = 0; + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; __asm__ __volatile__ ( - "ldr r2, [%[a], #64]\n\t" - "cmp r2, #0\n\t" - "beq 16f\n\t" - "mov r3, #544\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n16:\n\t" - "ldr r2, [%[a], #60]\n\t" - "cmp r2, #0\n\t" - "beq 15f\n\t" - "mov r3, #512\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n15:\n\t" - "ldr r2, [%[a], #56]\n\t" - "cmp r2, #0\n\t" - "beq 14f\n\t" - "mov r3, #480\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n14:\n\t" - "ldr r2, [%[a], #52]\n\t" - "cmp r2, #0\n\t" - "beq 13f\n\t" - "mov r3, #448\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n13:\n\t" - "ldr r2, [%[a], #48]\n\t" - "cmp r2, #0\n\t" - "beq 12f\n\t" - "mov r3, #416\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n12:\n\t" - "ldr r2, [%[a], #44]\n\t" - "cmp r2, #0\n\t" - "beq 11f\n\t" - "mov r3, #384\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n11:\n\t" - "ldr r2, [%[a], #40]\n\t" - "cmp r2, #0\n\t" - "beq 10f\n\t" - "mov r3, #352\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n10:\n\t" - "ldr r2, [%[a], #36]\n\t" - "cmp r2, #0\n\t" - "beq 9f\n\t" - "mov r3, #320\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n9:\n\t" - "ldr r2, [%[a], #32]\n\t" - "cmp r2, #0\n\t" - "beq 8f\n\t" - "mov r3, #288\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n8:\n\t" - "ldr r2, [%[a], #28]\n\t" - "cmp r2, #0\n\t" - "beq 7f\n\t" - "mov r3, #256\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n7:\n\t" - "ldr r2, [%[a], #24]\n\t" - "cmp r2, #0\n\t" - "beq 6f\n\t" - "mov r3, #224\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n6:\n\t" - "ldr r2, [%[a], #20]\n\t" - "cmp r2, #0\n\t" - "beq 5f\n\t" - "mov r3, #192\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n5:\n\t" - "ldr r2, [%[a], #16]\n\t" - "cmp r2, #0\n\t" - "beq 4f\n\t" - "mov r3, #160\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n4:\n\t" - "ldr r2, [%[a], #12]\n\t" - "cmp r2, #0\n\t" - "beq 3f\n\t" - "mov r3, #128\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n3:\n\t" - "ldr r2, [%[a], #8]\n\t" - "cmp r2, #0\n\t" - "beq 2f\n\t" - "mov r3, #96\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n2:\n\t" - "ldr r2, [%[a], #4]\n\t" - "cmp r2, #0\n\t" - "beq 1f\n\t" - "mov r3, #64\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "b 18f\n\t" - "\n1:\n\t" - "ldr r2, [%[a], #0]\n\t" - "mov r3, #32\n\t" - "clz %[r], r2\n\t" - "sub %[r], r3, %[r]\n\t" - "\n18:\n\t" - : [r] "+r" (r) - : [a] "r" (a) - : "r2", "r3" + "LDR r1, [%[a], #64]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_16_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_16_%=\n\t" +#endif + "MOV r2, #0x220\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_16_%=:\n\t" + "LDR r1, [%[a], #60]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_15_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_15_%=\n\t" +#endif + "MOV r2, #0x200\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + 
"L_sp_521_num_bits_17_15_%=:\n\t" + "LDR r1, [%[a], #56]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_14_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_14_%=\n\t" +#endif + "MOV r2, #0x1e0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_14_%=:\n\t" + "LDR r1, [%[a], #52]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_13_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_13_%=\n\t" +#endif + "MOV r2, #0x1c0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_13_%=:\n\t" + "LDR r1, [%[a], #48]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_12_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_12_%=\n\t" +#endif + "MOV r2, #0x1a0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_12_%=:\n\t" + "LDR r1, [%[a], #44]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_11_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_11_%=\n\t" +#endif + "MOV r2, #0x180\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_11_%=:\n\t" + "LDR r1, [%[a], #40]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_10_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_10_%=\n\t" +#endif + "MOV r2, #0x160\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_10_%=:\n\t" + "LDR r1, [%[a], #36]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_9_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_9_%=\n\t" +#endif + "MOV r2, #0x140\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_9_%=:\n\t" + "LDR r1, [%[a], #32]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_8_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_8_%=\n\t" +#endif + "MOV r2, #0x120\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_8_%=:\n\t" + "LDR r1, [%[a], #28]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_7_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_7_%=\n\t" +#endif + "MOV r2, #0x100\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_7_%=:\n\t" + "LDR r1, [%[a], #24]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_6_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_6_%=\n\t" +#endif + "MOV r2, #0xe0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_6_%=:\n\t" + "LDR r1, [%[a], #20]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_5_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_5_%=\n\t" +#endif + "MOV r2, 
#0xc0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_5_%=:\n\t" + "LDR r1, [%[a], #16]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_4_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_4_%=\n\t" +#endif + "MOV r2, #0xa0\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_4_%=:\n\t" + "LDR r1, [%[a], #12]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_3_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_3_%=\n\t" +#endif + "MOV r2, #0x80\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_3_%=:\n\t" + "LDR r1, [%[a], #8]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_2_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_2_%=\n\t" +#endif + "MOV r2, #0x60\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_2_%=:\n\t" + "LDR r1, [%[a], #4]\n\t" + "CMP r1, #0x0\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_521_num_bits_17_1_%=\n\t" +#else + "BEQ.N L_sp_521_num_bits_17_1_%=\n\t" +#endif + "MOV r2, #0x40\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "B L_sp_521_num_bits_17_18_%=\n\t" + "\n" + "L_sp_521_num_bits_17_1_%=:\n\t" + "LDR r1, [%[a]]\n\t" + "MOV r2, #0x20\n\t" + "CLZ r4, r1\n\t" + "SUB r4, r2, r4\n\t" + "\n" + "L_sp_521_num_bits_17_18_%=:\n\t" + "MOV %[a], r4\n\t" + : [a] "+r" (a) + : + : "memory", "r1", "r2", "r3", "r4", "r5" ); - - return r; + return (uint32_t)(size_t)a; } /* Non-constant time modular inversion. @@ -41525,83 +62016,1801 @@ typedef struct sp_point_1024 { * a A single precision integer. * b A single precision integer. 
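sp_1024_mul_16, which follows, replaces the looped generic multiply with a fully unrolled product-scanning (column-wise) schoolbook multiply: each column of 32x32-bit partial products is accumulated in three rotating registers and the finished low word is flushed to the result before moving to the next column. A compact portable model of that accumulation pattern, using a hypothetical mul_words helper (editor's sketch, not part of the patch):

    #include <stdint.h>

    /* Product-scanning multiply: r = a * b over s words each, 2*s result
     * words. c0/c1/c2 model the three rotating accumulator registers that
     * the ADDS/ADCS/ADC #0 sequences maintain. Illustrative only. */
    static void mul_words(uint32_t* r, const uint32_t* a, const uint32_t* b,
                          int s)
    {
        uint32_t c0 = 0, c1 = 0, c2 = 0;
        int i, k;
        for (k = 0; k < 2 * s - 1; k++) {
            for (i = (k < s) ? 0 : (k - s + 1); (i <= k) && (i < s); i++) {
                uint64_t p = (uint64_t)a[i] * b[k - i];
                uint64_t t = (uint64_t)c0 + (uint32_t)p;          /* low half */
                c0 = (uint32_t)t;
                t = (uint64_t)c1 + (uint32_t)(p >> 32) + (uint32_t)(t >> 32);
                c1 = (uint32_t)t;
                c2 += (uint32_t)(t >> 32);                        /* carry out */
            }
            r[k] = c0;        /* flush the finished column */
            c0 = c1;
            c1 = c2;
            c2 = 0;
        }
        r[2 * s - 1] = c0;
    }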
*/ -SP_NOINLINE static void sp_1024_mul_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_1024_mul_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[16 * 2]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r10, %[a]\n\t" - "mov r11, %[b]\n\t" - "mov r6, #64\n\t" - "add r6, r6, r10\n\t" - "mov r14, r6\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r5, #0\n\t" - "mov r6, #60\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov %[b], r9\n\t" - "sub %[b], %[b], %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add %[b], %[b], r11\n\t" - "\n2:\n\t" - /* Multiply Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [%[b]]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply Done */ - "add %[a], %[a], #4\n\t" - "sub %[b], %[b], #4\n\t" - "cmp %[a], r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, r9\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r12\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #120\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r], r8]\n\t" - "mov %[a], r10\n\t" - "mov %[b], r11\n\t" - : - : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; - XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); + __asm__ __volatile__ ( + "SUB sp, sp, #0x40\n\t" + /* A[0] * B[0] */ + "LDR r11, [%[a]]\n\t" + "LDR r12, [%[b]]\n\t" + "UMULL r3, r4, r11, r12\n\t" + "MOV r5, #0x0\n\t" + "STR r3, [sp]\n\t" + /* A[0] * B[1] */ + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[0] */ + "LDR r8, [%[a], #4]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #4]\n\t" + /* A[2] * B[0] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[1] */ + "LDR r11, [%[a], #4]\n\t" + "LDR r12, [%[b], #4]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[0] * B[2] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #8]\n\t" + /* A[0] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[2] */ + "LDR r9, [%[b], #8]\n\t" + 
"UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[1] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[0] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #12]\n\t" + /* A[4] * B[0] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[1] */ + "LDR r8, [%[a], #12]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[2] */ + "LDR r11, [%[a], #8]\n\t" + "LDR r12, [%[b], #8]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[3] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[0] * B[4] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #16]\n\t" + /* A[0] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[4] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[2] */ + "LDR r8, [%[a], #12]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[1] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[0] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #20]\n\t" + /* A[6] * B[0] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[1] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[2] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[3] */ + "LDR r11, [%[a], #12]\n\t" + "LDR r12, [%[b], #12]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[4] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[5] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[0] * B[6] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, 
#0x0\n\t" + "STR r3, [sp, #24]\n\t" + /* A[0] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[6] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[5] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[4] */ + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[3] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[2] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[1] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[0] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #28]\n\t" + /* A[8] * B[0] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[1] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[2] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[3] */ + "LDR r8, [%[a], #20]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[4] */ + "LDR r11, [%[a], #16]\n\t" + "LDR r12, [%[b], #16]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[5] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[6] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[7] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[0] * B[8] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #32]\n\t" + /* A[0] * B[9] */ + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[8] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[7] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[6] */ + 
"LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[4] */ + "LDR r8, [%[a], #20]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[3] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[2] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[1] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[0] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #36]\n\t" + /* A[10] * B[0] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[1] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[2] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[3] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[4] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[5] */ + "LDR r11, [%[a], #20]\n\t" + "LDR r12, [%[b], #20]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[6] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[7] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[8] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[9] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[0] * B[10] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #40]\n\t" + /* A[0] * B[11] */ + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[10] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[9] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], 
#36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[8] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[7] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[6] */ + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[5] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[4] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[3] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[2] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[1] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[0] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #44]\n\t" + /* A[12] * B[0] */ + "LDR r8, [%[a], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[1] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[2] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[3] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[4] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[5] */ + "LDR r8, [%[a], #28]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[6] */ + "LDR r11, [%[a], #24]\n\t" + "LDR r12, [%[b], #24]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[7] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[8] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[9] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[10] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, 
r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[11] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[0] * B[12] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #48]\n\t" + /* A[0] * B[13] */ + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[12] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[2] * B[11] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[10] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[9] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[8] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[6] */ + "LDR r8, [%[a], #28]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[5] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[4] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[3] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[2] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[1] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[0] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #52]\n\t" + /* A[14] * B[0] */ + "LDR r8, [%[a], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * B[1] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[12] * B[2] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[3] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + 
"ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[4] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[5] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[6] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[7] */ + "LDR r11, [%[a], #28]\n\t" + "LDR r12, [%[b], #28]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[8] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[9] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[10] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[11] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[2] * B[12] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * B[13] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[0] * B[14] */ + "LDR r8, [%[a]]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [sp, #56]\n\t" + /* A[0] * B[15] */ + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[1] * B[14] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[2] * B[13] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[12] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[11] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[10] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[9] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[8] */ + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[7] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + 
/* A[9] * B[6] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[5] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[4] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[12] * B[3] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[13] * B[2] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[14] * B[1] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[15] * B[0] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b]]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [sp, #60]\n\t" + /* A[15] * B[1] */ + "LDR r9, [%[b], #4]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * B[2] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[3] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[4] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[5] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[6] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[7] */ + "LDR r8, [%[a], #36]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[8] */ + "LDR r11, [%[a], #32]\n\t" + "LDR r12, [%[b], #32]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[9] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[10] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[11] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[4] * B[12] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[3] * B[13] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, 
#0x0\n\t" + /* A[2] * B[14] */ + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * B[15] */ + "LDR r8, [%[a], #4]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #64]\n\t" + /* A[2] * B[15] */ + "LDR r8, [%[a], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[3] * B[14] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[4] * B[13] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[12] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[11] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[10] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[9] */ + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[8] */ + "LDR r8, [%[a], #36]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[7] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[6] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[12] * B[5] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * B[4] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[14] * B[3] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[15] * B[2] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #8]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #68]\n\t" + /* A[15] * B[3] */ + "LDR r9, [%[b], #12]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[14] * B[4] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[13] * B[5] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[12] * B[6] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC 
r5, r5, #0x0\n\t" + /* A[11] * B[7] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[8] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[9] */ + "LDR r11, [%[a], #36]\n\t" + "LDR r12, [%[b], #36]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[10] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[11] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[6] * B[12] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[5] * B[13] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[4] * B[14] */ + "LDR r8, [%[a], #16]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[3] * B[15] */ + "LDR r8, [%[a], #12]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #72]\n\t" + /* A[4] * B[15] */ + "LDR r8, [%[a], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[5] * B[14] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[6] * B[13] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[12] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[11] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[10] */ + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[9] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[8] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[7] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[6] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * B[5] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[15] * 
B[4] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #16]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #76]\n\t" + /* A[15] * B[5] */ + "LDR r9, [%[b], #20]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[14] * B[6] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * B[7] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[12] * B[8] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[9] */ + "LDR r8, [%[a], #44]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[10] */ + "LDR r11, [%[a], #40]\n\t" + "LDR r12, [%[b], #40]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[11] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[8] * B[12] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[7] * B[13] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[6] * B[14] */ + "LDR r8, [%[a], #24]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[5] * B[15] */ + "LDR r8, [%[a], #20]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #80]\n\t" + /* A[6] * B[15] */ + "LDR r8, [%[a], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[7] * B[14] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[8] * B[13] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[12] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[11] */ + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[10] */ + "LDR r8, [%[a], #44]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[12] * B[9] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[13] * B[8] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[14] * B[7] 
*/ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[15] * B[6] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #24]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #84]\n\t" + /* A[15] * B[7] */ + "LDR r9, [%[b], #28]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * B[8] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[9] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[10] */ + "LDR r8, [%[a], #48]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[11] */ + "LDR r11, [%[a], #44]\n\t" + "LDR r12, [%[b], #44]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[10] * B[12] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[9] * B[13] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[8] * B[14] */ + "LDR r8, [%[a], #32]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[7] * B[15] */ + "LDR r8, [%[a], #28]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #88]\n\t" + /* A[8] * B[15] */ + "LDR r8, [%[a], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[9] * B[14] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[10] * B[13] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[12] */ + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[12] * B[11] */ + "LDR r8, [%[a], #48]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * B[10] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[14] * B[9] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[15] * B[8] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #32]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #92]\n\t" + /* A[15] * B[9] */ + "LDR r9, [%[b], #36]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" 
+ /* A[14] * B[10] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[13] * B[11] */ + "LDR r8, [%[a], #52]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[12] * B[12] */ + "LDR r11, [%[a], #48]\n\t" + "LDR r12, [%[b], #48]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[11] * B[13] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[10] * B[14] */ + "LDR r8, [%[a], #40]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[9] * B[15] */ + "LDR r8, [%[a], #36]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #96]\n\t" + /* A[10] * B[15] */ + "LDR r8, [%[a], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[11] * B[14] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * B[13] */ + "LDR r9, [%[b], #52]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[12] */ + "LDR r8, [%[a], #52]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * B[11] */ + "LDR r8, [%[a], #56]\n\t" + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[15] * B[10] */ + "LDR r8, [%[a], #60]\n\t" + "LDR r9, [%[b], #40]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #100]\n\t" + /* A[15] * B[11] */ + "LDR r9, [%[b], #44]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[14] * B[12] */ + "LDR r8, [%[a], #56]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * B[13] */ + "LDR r11, [%[a], #52]\n\t" + "LDR r12, [%[b], #52]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[12] * B[14] */ + "LDR r8, [%[a], #48]\n\t" + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[11] * B[15] */ + "LDR r8, [%[a], #44]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #104]\n\t" + /* A[12] * B[15] */ + "LDR r8, [%[a], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "MOV r5, #0x0\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[13] * B[14] */ + "LDR r9, [%[b], #56]\n\t" + "UMULL r6, r7, r11, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[14] * B[13] */ + "LDR r8, [%[a], #56]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + /* A[15] * B[12] */ + "LDR r8, [%[a], 
#60]\n\t" + "LDR r9, [%[b], #48]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], #108]\n\t" + /* A[15] * B[13] */ + "UMULL r6, r7, r8, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[14] * B[14] */ + "LDR r11, [%[a], #56]\n\t" + "LDR r12, [%[b], #56]\n\t" + "UMULL r6, r7, r11, r12\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * B[15] */ + "LDR r8, [%[a], #52]\n\t" + "LDR r9, [%[b], #60]\n\t" + "UMULL r6, r7, r8, r9\n\t" + "ADDS r4, r4, r6\n\t" + "ADCS r5, r5, r7\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #112]\n\t" + /* A[14] * B[15] */ + "UMULL r6, r7, r11, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[15] * B[14] */ + "LDR r8, [%[a], #60]\n\t" + "UMULL r6, r7, r8, r12\n\t" + "ADDS r5, r5, r6\n\t" + "ADCS r3, r3, r7\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r5, [%[r], #116]\n\t" + /* A[15] * B[15] */ + "UMLAL r3, r4, r8, r9\n\t" + "STR r3, [%[r], #120]\n\t" + "STR r4, [%[r], #124]\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM sp!, {r3, r4, r5, r6}\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r11", "r12" + ); } /* Square a and put result in r. (r = a * a) @@ -41609,122 +63818,1144 @@ SP_NOINLINE static void sp_1024_mul_16(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a) +static void sp_1024_sqr_16(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #128\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #60\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 4f\n\t" -#else - "beq.n 4f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bal 5f\n\t" -#else - "bal.n 5f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #64\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "cmp %[a], r2\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bgt 3f\n\t" -#else - "bgt.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #120\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #124\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, #128\n\t" - "add sp, sp, r6\n\t" + "SUB sp, sp, #0x40\n\t" + /* A[0] * A[0] */ + "LDR r10, [%[a]]\n\t" + "UMULL r8, r3, r10, r10\n\t" + "MOV r4, #0x0\n\t" + "STR r8, [sp]\n\t" + /* A[0] * A[1] */ + "LDR r10, [%[a], #4]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [sp, #4]\n\t" + /* A[0] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[1] * A[1] */ + "LDR r10, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [sp, #8]\n\t" + /* A[0] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[1] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r2, [sp, #12]\n\t" + /* A[0] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[1] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[2] * A[2] */ + "LDR r10, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, 
r9\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [sp, #16]\n\t" + /* A[0] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #20]\n\t" + /* A[0] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[3] */ + "LDR r10, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #24]\n\t" + /* A[0] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #28]\n\t" + /* A[0] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[4] */ + "LDR r10, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #32]\n\t" + /* A[0] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, 
#0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #36]\n\t" + /* A[0] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[5] */ + "LDR r10, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #40]\n\t" + /* A[0] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #44]\n\t" + /* A[0] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, 
[%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[6] */ + "LDR r10, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #48]\n\t" + /* A[0] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [sp, #52]\n\t" + /* A[0] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, 
r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[7] */ + "LDR r10, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [sp, #56]\n\t" + /* A[0] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a]]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[1] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[2] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [sp, #60]\n\t" + /* A[1] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #4]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[2] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[3] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[8] */ + "LDR r10, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + 
"ADC r7, r7, r7\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [%[r], #64]\n\t" + /* A[2] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #8]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[3] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[4] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #68]\n\t" + /* A[3] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #12]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[4] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[5] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[9] */ + "LDR r10, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #72]\n\t" + /* A[4] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #16]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[5] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[6] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[12] */ + "LDR r10, 
[%[a], #48]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [%[r], #76]\n\t" + /* A[5] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #20]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[6] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[7] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[10] * A[10] */ + "LDR r10, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #80]\n\t" + /* A[6] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #24]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[7] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[8] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[10] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #84]\n\t" + /* A[7] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #28]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[8] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[9] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[10] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #40]\n\t" + 
"UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[11] * A[11] */ + "LDR r10, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [%[r], #88]\n\t" + /* A[8] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #32]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r3, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[9] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[10] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[11] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "LDR r12, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r4, r4, r5\n\t" + "ADCS r2, r2, r6\n\t" + "ADC r3, r3, r7\n\t" + "STR r4, [%[r], #92]\n\t" + /* A[9] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #36]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r4, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[10] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[11] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[12] * A[12] */ + "LDR r10, [%[a], #48]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r5\n\t" + "ADCS r3, r3, r6\n\t" + "ADC r4, r4, r7\n\t" + "STR r2, [%[r], #96]\n\t" + /* A[10] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #40]\n\t" + "UMULL r5, r6, r10, r12\n\t" + "MOV r2, #0x0\n\t" + "MOV r7, #0x0\n\t" + /* A[11] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + /* A[12] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "LDR r12, [%[a], #48]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r5, r5, r8\n\t" + "ADCS r6, r6, r9\n\t" + "ADC r7, r7, #0x0\n\t" + "ADDS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADC r7, r7, r7\n\t" + "ADDS r3, r3, r5\n\t" + "ADCS r4, r4, r6\n\t" + "ADC r2, r2, r7\n\t" + "STR r3, [%[r], #100]\n\t" + /* A[11] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #44]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[12] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #48]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + /* A[13] * A[13] */ + "LDR r10, [%[a], #52]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "ADC r3, r3, #0x0\n\t" + "STR 
r4, [%[r], #104]\n\t" + /* A[12] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #48]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "MOV r4, #0x0\n\t" + "ADC r4, r4, #0x0\n\t" + /* A[13] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "LDR r12, [%[a], #52]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "ADDS r2, r2, r8\n\t" + "ADCS r3, r3, r9\n\t" + "ADC r4, r4, #0x0\n\t" + "STR r2, [%[r], #108]\n\t" + /* A[13] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #52]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "MOV r2, #0x0\n\t" + "ADC r2, r2, #0x0\n\t" + /* A[14] * A[14] */ + "LDR r10, [%[a], #56]\n\t" + "UMULL r8, r9, r10, r10\n\t" + "ADDS r3, r3, r8\n\t" + "ADCS r4, r4, r9\n\t" + "ADC r2, r2, #0x0\n\t" + "STR r3, [%[r], #112]\n\t" + /* A[14] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "LDR r12, [%[a], #56]\n\t" + "UMULL r8, r9, r10, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r2, r2, r9\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, #0x0\n\t" + "STR r4, [%[r], #116]\n\t" + /* A[15] * A[15] */ + "LDR r10, [%[a], #60]\n\t" + "UMLAL r2, r3, r10, r10\n\t" + "STR r2, [%[r], #120]\n\t" + "STR r3, [%[r], #124]\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + "LDM sp!, {r2, r3, r4, r8}\n\t" + "STM %[r]!, {r2, r3, r4, r8}\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12" ); } @@ -41734,161 +64965,123 @@ SP_NOINLINE static void sp_1024_sqr_16(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. 
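+ *
+ * The rewritten routine below loads four words at a time with LDM and
+ * chains ADDS/ADCS, then materialises the final carry with ADC into the
+ * register that held r, which is why the C wrapper ends with
+ * return (uint32_t)(size_t)r. An illustrative, portable C sketch of the
+ * same operation (not part of this change; assumes sp_digit is a 32-bit
+ * word as on this target):
+ *   sp_digit c = 0;
+ *   for (i = 0; i < 16; i++) {
+ *       sp_digit t = a[i] + b[i];
+ *       r[i] = t + c;
+ *       c = (sp_digit)((t < a[i]) | ((t + c) < t));
+ *   }
+ *   return c;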
*/ -SP_NOINLINE static sp_digit sp_1024_add_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_1024_add_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } -/* Sub b from a into r. (r = a - b) +/* Sub b from a into a. (a -= b) * - * r A single precision integer. - * a A single precision integer. + * a A single precision integer and result. * b A single precision integer. 
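+ *
+ * The trailing "SBC %[a], r9, r9" leaves 0 when no borrow occurred and
+ * 0xffffffff when it did, and that value is what the function returns.
+ * An illustrative, portable C sketch of the borrow chain (not part of
+ * this change; assumes sp_digit is a 32-bit word):
+ *   sp_digit c = 0;
+ *   for (i = 0; i < 32; i++) {
+ *       sp_digit d = a[i] - b[i];
+ *       sp_digit n = (sp_digit)((a[i] < b[i]) | (d < c));
+ *       a[i] = d - c;
+ *       c = n;
+ *   }
+ *   return (sp_digit)0 - c;   /* 0 or 0xffffffff, matching the SBC */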
*/ -SP_NOINLINE static sp_digit sp_1024_sub_in_place_32(sp_digit* a, - const sp_digit* b) +static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "subs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "ldm %[a], {r3, r4}\n\t" - "ldm %[b]!, {r5, r6}\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "stm %[a]!, {r3, r4}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SUBS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM 
%[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC %[a], r9, r9\n\t" + : [a] "+r" (a), [b] "+r" (b) : - : "memory", "r3", "r4", "r5", "r6" + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); - - return c; + return (uint32_t)(size_t)a; } /* Add b to a into r. (r = a + b) @@ -41897,100 +65090,76 @@ SP_NOINLINE static sp_digit sp_1024_sub_in_place_32(sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_1024_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adds r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "adcs r4, r4, r6\n\t" - "adcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), 
[r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADDS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "ADCS r3, r3, r7\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "MOV %[r], #0x0\n\t" + "ADC %[r], %[r], #0x0\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* AND m into each word of a and store in r. @@ -42068,59 +65237,47 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_1024_sub_16(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_1024_sub_16(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } /* Square a and put result in r. (r = a * a) @@ -42166,83 +65323,74 @@ SP_NOINLINE static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) * a A single precision integer. * b A single precision integer. 
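+ *
+ * The replacement computes the 64-word product column by column (product
+ * scanning) into a 256-byte buffer on the stack, using r6/r7/r8 as a
+ * 96-bit column accumulator, and then pops the buffer back out to r.
+ * An illustrative, portable C sketch of the same scheme (not part of
+ * this change; word64 is the 64-bit unsigned type):
+ *   sp_digit t[64];
+ *   sp_digit lo = 0, hi = 0, ov = 0;
+ *   for (k = 0; k <= 62; k++) {
+ *       int i, first = (k < 32) ? 0 : k - 31, last = (k < 32) ? k : 31;
+ *       for (i = first; i <= last; i++) {
+ *           word64 p = (word64)a[i] * b[k - i];
+ *           word64 s = (word64)lo + (sp_digit)p;
+ *           lo = (sp_digit)s;
+ *           s = (word64)hi + (sp_digit)(p >> 32) + (sp_digit)(s >> 32);
+ *           hi = (sp_digit)s;
+ *           ov += (sp_digit)(s >> 32);
+ *       }
+ *       t[k] = lo; lo = hi; hi = ov; ov = 0;
+ *   }
+ *   t[63] = lo;
+ *   XMEMCPY(r, t, sizeof(t));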
*/ -SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit tmp_arr[32 * 2]; - sp_digit* tmp = tmp_arr; - __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r10, %[a]\n\t" - "mov r11, %[b]\n\t" - "mov r6, #128\n\t" - "add r6, r6, r10\n\t" - "mov r14, r6\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r5, #0\n\t" - "mov r6, #124\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov %[b], r9\n\t" - "sub %[b], %[b], %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add %[b], %[b], r11\n\t" - "\n2:\n\t" - /* Multiply Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [%[b]]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply Done */ - "add %[a], %[a], #4\n\t" - "sub %[b], %[b], #4\n\t" - "cmp %[a], r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" -#else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, r9\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" -#else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r12\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #248\n\t" - "cmp r8, r6\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" -#else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r], r8]\n\t" - "mov %[a], r10\n\t" - "mov %[b], r11\n\t" - : - : [r] "r" (tmp), [a] "r" (a), [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" - ); + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; - XMEMCPY(r, tmp_arr, sizeof(tmp_arr)); + __asm__ __volatile__ ( + "SUB sp, sp, #0x100\n\t" + "MOV r5, #0x0\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "\n" + "L_sp_1024_mul_32_outer_%=:\n\t" + "SUBS r3, r5, #0x7c\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_1024_mul_32_inner_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x80\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BEQ L_sp_1024_mul_32_inner_done_%=\n\t" +#else + "BEQ.N L_sp_1024_mul_32_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_1024_mul_32_inner_%=\n\t" +#else + "BLE.N L_sp_1024_mul_32_inner_%=\n\t" +#endif + "\n" + "L_sp_1024_mul_32_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0xf8\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLE L_sp_1024_mul_32_outer_%=\n\t" +#else + "BLE.N L_sp_1024_mul_32_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + 
"L_sp_1024_mul_32_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_1024_mul_32_store_%=\n\t" +#else + "BGT.N L_sp_1024_mul_32_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" + ); } /* Square a and put result in r. (r = a * a) @@ -42250,124 +65398,97 @@ SP_NOINLINE static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, * r A single precision integer. * a A single precision integer. */ -SP_NOINLINE static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) +static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mov r4, #0\n\t" - "mov r5, #0\n\t" - "mov r9, r3\n\t" - "mov r12, %[r]\n\t" - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "neg r6, r6\n\t" - "add sp, sp, r6\n\t" - "mov r11, sp\n\t" - "mov r10, %[a]\n\t" - "\n1:\n\t" - "mov %[r], #0\n\t" - "mov r6, #124\n\t" - "mov %[a], r9\n\t" - "subs %[a], %[a], r6\n\t" - "sbc r6, r6, r6\n\t" - "mvn r6, r6\n\t" - "and %[a], %[a], r6\n\t" - "mov r2, r9\n\t" - "sub r2, r2, %[a]\n\t" - "add %[a], %[a], r10\n\t" - "add r2, r2, r10\n\t" - "\n2:\n\t" - "cmp r2, %[a]\n\t" + "SUB sp, sp, #0x100\n\t" + "MOV r6, #0x0\n\t" + "MOV r7, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_1024_sqr_32_outer_%=:\n\t" + "SUBS r3, r5, #0x7c\n\t" + "IT cc\n\t" + "movcc r3, #0\n\t" + "SUB r4, r5, r3\n\t" + "\n" + "L_sp_1024_sqr_32_inner_%=:\n\t" + "CMP r4, r3\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 4f\n\t" + "BEQ L_sp_1024_sqr_32_op_sqr_%=\n\t" #else - "beq.n 4f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - /* Multiply * 2: Start */ - "ldr r6, [%[a]]\n\t" - "ldr r8, [r2]\n\t" - "umull r6, r8, r6, r8\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Multiply * 2: Done */ + "BEQ.N L_sp_1024_sqr_32_op_sqr_%=\n\t" +#endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[a], r4]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "bal L_sp_1024_sqr_32_op_done_%=\n\t" + "\n" + "L_sp_1024_sqr_32_op_sqr_%=:\n\t" + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" + "\n" + "L_sp_1024_sqr_32_op_done_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bal 5f\n\t" + "BEQ L_sp_1024_sqr_32_inner_done_%=\n\t" #else - "bal.n 5f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n4:\n\t" - /* Square: Start */ - "ldr r6, [%[a]]\n\t" - "umull r6, r8, r6, r6\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, %[r]\n\t" - /* Square: Done */ - "\n5:\n\t" - "add %[a], %[a], #4\n\t" - "sub r2, r2, #4\n\t" - "mov r6, #128\n\t" - "add r6, r6, r10\n\t" - "cmp %[a], r6\n\t" + "BEQ.N L_sp_1024_sqr_32_inner_done_%=\n\t" +#endif + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "beq 3f\n\t" + "BGT 
L_sp_1024_sqr_32_inner_done_%=\n\t" #else - "beq.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "cmp %[a], r2\n\t" + "BGT.N L_sp_1024_sqr_32_inner_done_%=\n\t" +#endif + "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bgt 3f\n\t" + "BLE L_sp_1024_sqr_32_inner_%=\n\t" #else - "bgt.n 3f\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r8, r9\n\t" - "add r8, r8, r10\n\t" - "cmp %[a], r8\n\t" + "BLE.N L_sp_1024_sqr_32_inner_%=\n\t" +#endif + "\n" + "L_sp_1024_sqr_32_inner_done_%=:\n\t" + "STR r6, [sp, r5]\n\t" + "MOV r6, r7\n\t" + "MOV r7, r8\n\t" + "MOV r8, #0x0\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0xf8\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 2b\n\t" + "BLE L_sp_1024_sqr_32_outer_%=\n\t" #else - "ble.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "\n3:\n\t" - "mov %[r], r11\n\t" - "mov r8, r9\n\t" - "str r3, [%[r], r8]\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "mov r5, #0\n\t" - "add r8, r8, #4\n\t" - "mov r9, r8\n\t" - "mov r6, #248\n\t" - "cmp r8, r6\n\t" + "BLE.N L_sp_1024_sqr_32_outer_%=\n\t" +#endif + "STR r6, [sp, r5]\n\t" + "\n" + "L_sp_1024_sqr_32_store_%=:\n\t" + "LDM sp!, {r6, r7, r8, r9}\n\t" + "STM %[r]!, {r6, r7, r8, r9}\n\t" + "SUBS r5, r5, #0x10\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "ble 1b\n\t" + "BGT L_sp_1024_sqr_32_store_%=\n\t" #else - "ble.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov %[a], r10\n\t" - "str r3, [%[r], r8]\n\t" - "mov %[r], r12\n\t" - "mov %[a], r11\n\t" - "mov r3, #252\n\t" - "\n4:\n\t" - "ldr r6, [%[a], r3]\n\t" - "str r6, [%[r], r3]\n\t" - "subs r3, r3, #4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 4b\n\t" -#else - "bge.n 4b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "mov r6, #1\n\t" - "lsl r6, r6, #8\n\t" - "add sp, sp, r6\n\t" + "BGT.N L_sp_1024_sqr_32_store_%=\n\t" +#endif + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) - : "memory", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11" ); } @@ -42463,42 +65584,41 @@ static const sp_point_1024 p1024_base = { * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_1024_sub_in_place_32(sp_digit* a, - const sp_digit* b) +static sp_digit sp_1024_sub_in_place_32(sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; - __asm__ __volatile__ ( - "mov r8, %[a]\n\t" - "add r8, r8, #128\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r3, [%[a]]\n\t" - "ldr r4, [%[a], #4]\n\t" - "ldr r5, [%[b]]\n\t" - "ldr r6, [%[b], #4]\n\t" - "sbcs r3, r3, r5\n\t" - "sbcs r4, r4, r6\n\t" - "str r3, [%[a]]\n\t" - "str r4, [%[a], #4]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #8\n\t" - "add %[b], %[b], #8\n\t" - "cmp %[a], r8\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" -#else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [a] "+r" (a), [b] "+r" (b) - : - : "memory", "r3", "r4", "r5", "r6", "r8" - ); + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; - return c; + __asm__ __volatile__ ( + "MOV r10, #0x0\n\t" + "ADD r11, %[a], #0x80\n\t" + "\n" + "L_sp_1024_sub_in_pkace_32_word_%=:\n\t" + "RSBS r10, r10, #0x0\n\t" + "LDM %[a], {r2, r3, r4, r5}\n\t" + "LDM %[b]!, {r6, r7, r8, r9}\n\t" + "SBCS r2, r2, r6\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "STM %[a]!, {r2, r3, r4, r5}\n\t" + "SBC r10, r10, r10\n\t" + "CMP %[a], r11\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BNE L_sp_1024_sub_in_pkace_32_word_%=\n\t" +#else + "BNE.N L_sp_1024_sub_in_pkace_32_word_%=\n\t" +#endif + "MOV %[a], r10\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" + ); + return (uint32_t)(size_t)a; } #endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -42507,39 +65627,180 @@ SP_NOINLINE static sp_digit sp_1024_sub_in_place_32(sp_digit* a, * b A single precision number to subtract. * m Mask value to apply. 
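+ *
+ * Masking b with m before the subtract means the same instruction
+ * sequence runs whether or not the subtraction is wanted, so the routine
+ * is constant time with respect to m. An illustrative, portable C sketch
+ * (not part of this change):
+ *   sp_digit c = 0;
+ *   for (i = 0; i < 32; i++) {
+ *       sp_digit w = b[i] & m;
+ *       sp_digit d = a[i] - w;
+ *       sp_digit n = (sp_digit)((a[i] < w) | (d < c));
+ *       r[i] = d - c;
+ *       c = n;
+ *   }
+ *   return (sp_digit)0 - c;   /* 0, or 0xffffffff on borrow when m is -1 */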
*/ -SP_NOINLINE static sp_digit sp_1024_cond_sub_32(sp_digit* r, const sp_digit* a, - const sp_digit* b, sp_digit m) +static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #128\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r5, [%[a], r8]\n\t" - "sbcs r5, r5, r6\n\t" - "sbcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "\n" + "L_sp_1024_cond_sub_32_words_%=:\n\t" + "SUBS r4, r8, r4\n\t" + "LDR r6, [%[a], r5]\n\t" + "LDR r7, [%[b], r5]\n\t" + "AND r7, r7, %[m]\n\t" + "SBCS r6, r6, r7\n\t" + "SBC r4, r8, r8\n\t" + "STR r6, [%[r], r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "CMP r5, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_1024_cond_sub_32_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_1024_cond_sub_32_words_%=\n\t" +#endif + "MOV %[r], r4\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } +#else +/* Conditionally subtract b from a using the mask m. + * m is -1 to subtract and 0 when not copying. + * + * r A single precision number representing condition subtract result. + * a A single precision number to subtract from. + * b A single precision number to subtract. + * m Mask value to apply. 
+ */ +static sp_digit sp_1024_cond_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + + __asm__ __volatile__ ( + "MOV r5, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SUBS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "SBCS r6, r6, r8\n\t" + "SBCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "SBC %[r], r5, r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ #ifdef WOLFSSL_SP_SMALL /* Add b to a 
into r. (r = a + b) * @@ -42547,84 +65808,271 @@ SP_NOINLINE static sp_digit sp_1024_cond_sub_32(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_1024_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "mov r8, #0\n\t" - "add r6, r6, #128\n\t" - "sub r8, r8, #1\n\t" - "\n1:\n\t" - "adds %[c], %[c], r8\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "adcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "mov %[c], #0\n\t" - "adc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r3, #0x0\n\t" + "ADD r12, %[a], #0x80\n\t" + "\n" + "L_sp_1024_add_32_word_%=:\n\t" + "ADDS r3, r3, #0x-1\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "MOV r4, #0x0\n\t" + "ADC r3, r4, #0x0\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_1024_add_32_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_1024_add_32_word_%=\n\t" +#endif + "MOV %[r], r3\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_SMALL /* Mul a by digit b into r. (r = a * b) * * r A single precision integer. * a A single precision integer. * b A single precision digit. 
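+ *
+ * An illustrative, portable C sketch of the digit multiply below (not
+ * part of this change; word64 is the 64-bit unsigned type). Note that the
+ * result occupies 33 words: the final carry is written to r[32], which is
+ * what the trailing "STR r3, [%[r], #128]" does.
+ *   word64 t = 0;
+ *   for (i = 0; i < 32; i++) {
+ *       t += (word64)a[i] * b;
+ *       r[i] = (sp_digit)t;
+ *       t >>= 32;
+ *   }
+ *   r[32] = (sp_digit)t;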
*/ -SP_NOINLINE static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, - sp_digit b) +static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + __asm__ __volatile__ ( - "add r9, %[a], #128\n\t" /* A[0] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r5, r3, r6, %[b]\n\t" - "mov r4, #0\n\t" - "str r5, [%[r]], #4\n\t" - /* A[0] * B - Done */ - "\n1:\n\t" - "mov r5, #0\n\t" - /* A[] * B */ - "ldr r6, [%[a]], #4\n\t" - "umull r6, r8, r6, %[b]\n\t" - "adds r3, r3, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc r5, r5, #0\n\t" - /* A[] * B - Done */ - "str r3, [%[r]], #4\n\t" - "mov r3, r4\n\t" - "mov r4, r5\n\t" - "cmp %[a], r9\n\t" + "LDR r8, [%[a]]\n\t" + "UMULL r5, r3, %[b], r8\n\t" + "MOV r4, #0x0\n\t" + "STR r5, [%[r]]\n\t" + "MOV r5, #0x0\n\t" + "MOV r9, #0x4\n\t" + "\n" + "L_sp_1024_mul_d_32_word_%=:\n\t" + /* A[i] * B */ + "LDR r8, [%[a], r9]\n\t" + "UMULL r6, r7, %[b], r8\n\t" + "ADDS r3, r3, r6\n\t" + "ADCS r4, r4, r7\n\t" + "ADC r5, r5, #0x0\n\t" + "STR r3, [%[r], r9]\n\t" + "MOV r3, r4\n\t" + "MOV r4, r5\n\t" + "MOV r5, #0x0\n\t" + "ADD r9, r9, #0x4\n\t" + "CMP r9, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_1024_mul_d_32_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "str r3, [%[r]]\n\t" - : [r] "+r" (r), [a] "+r" (a) - : [b] "r" (b) - : "memory", "r3", "r4", "r5", "r6", "r8", "r9" + "BLT.N L_sp_1024_mul_d_32_word_%=\n\t" +#endif + "STR r3, [%[r], #128]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9" ); } +#else +/* Mul a by digit b into r. (r = a * b) + * + * r A single precision integer. + * a A single precision integer. + * b A single precision digit. 
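+ *
+ * This unrolled variant leans on UMLAL, which performs
+ * {hi:lo} += rn * rm in a single instruction, so the carry word simply
+ * rotates through r3/r4/r5. Conceptually each step is (illustrative
+ * sketch only; word64 is the 64-bit unsigned type):
+ *   word64 acc = (word64)lo + (word64)a[i] * b;  /* hi is zero here      */
+ *   r[i] = (sp_digit)acc;                        /* store the low word   */
+ *   lo = (sp_digit)(acc >> 32);                  /* carry feeds the next */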
+ */ +static void sp_1024_mul_d_32(sp_digit* r_p, const sp_digit* a_p, sp_digit b_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register sp_digit b asm ("r2") = (sp_digit)b_p; + + __asm__ __volatile__ ( + /* A[0] * B */ + "LDM %[a]!, {r8}\n\t" + "UMULL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[1] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[2] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[3] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[4] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[5] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[6] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[7] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[8] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[9] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[10] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[11] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[12] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[13] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[14] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[15] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[16] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[17] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[18] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[19] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[20] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[21] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[22] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[23] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[24] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[25] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[26] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[27] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" 
+ "MOV r5, #0x0\n\t" + /* A[28] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "MOV r3, #0x0\n\t" + /* A[29] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r5, r3, %[b], r8\n\t" + "STM %[r]!, {r5}\n\t" + "MOV r4, #0x0\n\t" + /* A[30] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r3, r4, %[b], r8\n\t" + "STM %[r]!, {r3}\n\t" + "MOV r5, #0x0\n\t" + /* A[31] * B */ + "LDM %[a]!, {r8}\n\t" + "UMLAL r4, r5, %[b], r8\n\t" + "STM %[r]!, {r4}\n\t" + "STR r5, [%[r]]\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8" + ); +} + +#endif /* WOLFSSL_SP_SMALL */ +#ifdef WOLFSSL_SP_USE_UDIV /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) * * d1 The high order half of the number to divide. @@ -42634,49 +66082,122 @@ SP_NOINLINE static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, * * Note that this is an approximate div. It may give an answer 1 larger. */ -SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, - sp_digit div) +static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) { - sp_digit r = 0; + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; __asm__ __volatile__ ( - "lsr r6, %[div], #16\n\t" - "add r6, r6, #1\n\t" - "udiv r4, %[d1], r6\n\t" - "lsl r8, r4, #16\n\t" - "umull r4, r5, %[div], r8\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r5, %[d1], r6\n\t" - "lsl r4, r5, #16\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "lsl r4, %[d1], #16\n\t" - "orr r4, r4, %[d0], lsr #16\n\t" - "udiv r4, r4, r6\n\t" - "add r8, r8, r4\n\t" - "umull r4, r5, %[div], r4\n\t" - "subs %[d0], %[d0], r4\n\t" - "sbc %[d1], %[d1], r5\n\t" - "udiv r4, %[d0], %[div]\n\t" - "add r8, r8, r4\n\t" - "mov %[r], r8\n\t" - : [r] "+r" (r) - : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div) - : "r4", "r5", "r6", "r8" + "LSR r8, %[div], #16\n\t" + "ADD r5, r8, #0x1\n\t" + "UDIV r6, %[d1], r5\n\t" + "LSL r7, %[div], #16\n\t" + "LSL r6, r6, #16\n\t" + "UMULL r3, r4, %[div], r6\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "SUBS r3, %[d1], r5\n\t" + "SBC r9, r9, r9\n\t" + "ADD r9, r9, #0x1\n\t" + "RSB r10, r9, #0x0\n\t" + "LSL r9, r9, #16\n\t" + "AND r7, r7, r10\n\t" + "AND r8, r8, r10\n\t" + "SUBS %[d0], %[d0], r7\n\t" + "ADD r6, r6, r9\n\t" + "SBC %[d1], %[d1], r8\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "UMULL r3, r4, %[div], r3\n\t" + "SUBS %[d0], %[d0], r3\n\t" + "SBC %[d1], %[d1], r4\n\t" + "LSL r4, %[d1], #16\n\t" + "LSR r3, %[d0], #16\n\t" + "ORR r3, r3, r4\n\t" + "UDIV r3, r3, r5\n\t" + "ADD r6, r6, r3\n\t" + "MUL r3, %[div], r3\n\t" + "SUB %[d0], %[d0], r3\n\t" + "UDIV r3, %[d0], %[div]\n\t" + "ADD %[d1], r6, r3\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - return r; + return (uint32_t)(size_t)d1; } +#else +/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div) + * + * d1 The high order half of the number to divide. + * d0 The low order half of the number to divide. 
+ * div The divisor. + * returns the result of the division. + * + * Note that this is an approximate div. It may give an answer 1 larger. + */ +static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +{ + register sp_digit d1 asm ("r0") = (sp_digit)d1_p; + register sp_digit d0 asm ("r1") = (sp_digit)d0_p; + register sp_digit div asm ("r2") = (sp_digit)div_p; + + __asm__ __volatile__ ( + "LSR r5, %[div], #1\n\t" + "ADD r5, r5, #0x1\n\t" + "MOV r6, %[d0]\n\t" + "MOV r7, %[d1]\n\t" + /* Do top 32 */ + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "MOV r3, #0x0\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + /* Next 30 bits */ + "MOV r4, #0x1d\n\t" + "\n" + "L_div_1024_word_32_bit_%=:\n\t" + "LSLS r6, r6, #1\n\t" + "ADC r7, r7, r7\n\t" + "SUBS r8, r5, r7\n\t" + "SBC r8, r8, r8\n\t" + "ADD r3, r3, r3\n\t" + "SUB r3, r3, r8\n\t" + "AND r8, r8, r5\n\t" + "SUBS r7, r7, r8\n\t" + "SUBS r4, r4, #0x1\n\t" + "bpl L_div_1024_word_32_bit_%=\n\t" + "ADD r3, r3, r3\n\t" + "ADD r3, r3, #0x1\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "UMULL r6, r7, r3, %[div]\n\t" + "SUBS r9, %[d0], r6\n\t" + "SBC r10, %[d1], r7\n\t" + "ADD r3, r3, r10\n\t" + "SUBS r8, %[div], r9\n\t" + "SBC r8, r8, r8\n\t" + "SUB %[d1], r3, r8\n\t" + : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)d1; +} + +#endif /* AND m into each word of a and store in r. * * r A single precision integer. @@ -42714,44 +66235,395 @@ static void sp_1024_mask_32(sp_digit* r, const sp_digit* a, sp_digit m) * return -ve, 0 or +ve if a is less than, equal to or greater than b * respectively. 
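+ *
+ * The comparison below is constant time: conditional moves (IT blocks)
+ * update the result, and the mask register is cleared at the first
+ * differing word so that less significant words can no longer change the
+ * outcome. An illustrative C sketch of the logic, with plain branches
+ * standing in for the conditional moves (not part of this change):
+ *   sp_int32 res = -1;
+ *   sp_digit mask = (sp_digit)-1;
+ *   for (i = 31; i >= 0; i--) {
+ *       sp_digit x = a[i] & mask, y = b[i] & mask;
+ *       if (x > y) res = 1;
+ *       if (x < y) res = (sp_int32)mask;   /* -1 while still undecided */
+ *       if (x != y) mask = 0;              /* freeze the result        */
+ *   }
+ *   return res ^ (sp_int32)mask;           /* both still -1 => equal => 0 */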
*/ -SP_NOINLINE static sp_int32 sp_1024_cmp_32(const sp_digit* a, const sp_digit* b) +static sp_int32 sp_1024_cmp_32(const sp_digit* a_p, const sp_digit* b_p) { - sp_digit r = 0; - + register const sp_digit* a asm ("r0") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r1") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r3, #0\n\t" - "mvn r3, r3\n\t" - "mov r6, #124\n\t" - "\n1:\n\t" - "ldr r8, [%[a], r6]\n\t" - "ldr r5, [%[b], r6]\n\t" - "and r8, r8, r3\n\t" - "and r5, r5, r3\n\t" - "mov r4, r8\n\t" - "subs r8, r8, r5\n\t" - "sbc r8, r8, r8\n\t" - "add %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "subs r5, r5, r4\n\t" - "sbc r8, r8, r8\n\t" - "sub %[r], %[r], r8\n\t" - "mvn r8, r8\n\t" - "and r3, r3, r8\n\t" - "sub r6, r6, #4\n\t" - "cmp r6, #0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bge 1b\n\t" + "MOV r2, #0x-1\n\t" + "MOV r8, #0x1\n\t" + "MOV r7, #0x0\n\t" + "MOV r3, #0x-1\n\t" +#ifdef WOLFSSL_SP_SMALL + "MOV r6, #0x7c\n\t" + "\n" + "L_sp_1024_cmp_32_words_%=:\n\t" + "LDR r4, [%[a], r6]\n\t" + "LDR r5, [%[b], r6]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "SUBS r6, r6, #0x4\n\t" + "bcs L_sp_1024_cmp_32_words_%=\n\t" + "EOR r2, r2, r3\n\t" #else - "bge.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [r] "+r" (r) - : [a] "r" (a), [b] "r" (b) - : "r3", "r4", "r5", "r6", "r8" + "LDR r4, [%[a], #124]\n\t" + "LDR r5, [%[b], #124]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #120]\n\t" + "LDR r5, [%[b], #120]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #116]\n\t" + "LDR r5, [%[b], #116]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #112]\n\t" + "LDR r5, [%[b], #112]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #108]\n\t" + "LDR r5, [%[b], #108]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #104]\n\t" + "LDR r5, [%[b], #104]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #100]\n\t" + "LDR r5, [%[b], #100]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #96]\n\t" + "LDR r5, [%[b], #96]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #92]\n\t" + "LDR r5, [%[b], #92]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi 
r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #88]\n\t" + "LDR r5, [%[b], #88]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #84]\n\t" + "LDR r5, [%[b], #84]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #80]\n\t" + "LDR r5, [%[b], #80]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #76]\n\t" + "LDR r5, [%[b], #76]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #72]\n\t" + "LDR r5, [%[b], #72]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #68]\n\t" + "LDR r5, [%[b], #68]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #64]\n\t" + "LDR r5, [%[b], #64]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #60]\n\t" + "LDR r5, [%[b], #60]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #56]\n\t" + "LDR r5, [%[b], #56]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #52]\n\t" + "LDR r5, [%[b], #52]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #48]\n\t" + "LDR r5, [%[b], #48]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #44]\n\t" + "LDR r5, [%[b], #44]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #40]\n\t" + "LDR r5, [%[b], #40]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #36]\n\t" + "LDR r5, [%[b], #36]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #32]\n\t" + "LDR r5, [%[b], #32]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR 
r4, [%[a], #28]\n\t" + "LDR r5, [%[b], #28]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #24]\n\t" + "LDR r5, [%[b], #24]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #20]\n\t" + "LDR r5, [%[b], #20]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #16]\n\t" + "LDR r5, [%[b], #16]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #12]\n\t" + "LDR r5, [%[b], #12]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #8]\n\t" + "LDR r5, [%[b], #8]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a], #4]\n\t" + "LDR r5, [%[b], #4]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[b]]\n\t" + "AND r4, r4, r3\n\t" + "AND r5, r5, r3\n\t" + "SUBS r4, r4, r5\n\t" + "IT hi\n\t" + "movhi r2, r8\n\t" + "IT lo\n\t" + "movlo r2, r3\n\t" + "IT ne\n\t" + "movne r3, r7\n\t" + "EOR r2, r2, r3\n\t" +#endif /*WOLFSSL_SP_SMALL */ + "MOV %[a], r2\n\t" + : [a] "+r" (a), [b] "+r" (b) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8" ); - - return r; + return (uint32_t)(size_t)a; } /* Divide d in a and put remainder into r (m*d + r = a) @@ -42897,14 +66769,14 @@ static void sp_1024_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 32 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 31); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 31); } #elif DIGIT_BIT > 32 unsigned int i; @@ -43082,113 +66954,525 @@ static int sp_1024_point_to_ecc_point_32(const sp_point_1024* p, ecc_point* pm) return err; } +#ifdef WOLFSSL_SP_NO_UMAAL /* Reduce the number back to 1024 bits using Montgomery reduction. * * a A single precision number to reduce in place. * m The single precision number representing the modulus. * mp The digit representing the negative inverse of m mod 2^n. 
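The unrolled UMLAL/UMAAL loops below are easier to follow next to a portable sketch of the same word-by-word Montgomery reduction. The sketch assumes 32-bit digits and a 32-word (1024-bit) modulus; the name mont_reduce_ref is illustrative only, it leaves the reduced value in the upper half of a rather than copying it back down as the library routine does, and it finishes with a full trial subtraction where the generated assembly uses a cheaper top-word test.

#include <stdint.h>

#define WORDS 32  /* 1024-bit modulus, 32-bit digits (assumption) */

/* Illustrative word-by-word Montgomery reduction: for each word i,
 * mu = a[i] * mp (mod 2^32) is chosen so that adding mu * m clears a[i];
 * carries out of each pass are deferred into 'over' and folded into the
 * next top-word addition.  The result ends up in a[WORDS..2*WORDS-1]. */
static void mont_reduce_ref(uint32_t* a, const uint32_t* m, uint32_t mp)
{
    uint32_t over = 0;                        /* carry beyond the top word */
    for (int i = 0; i < WORDS; i++) {
        uint32_t mu = a[i] * mp;
        uint64_t carry = 0;
        for (int j = 0; j < WORDS; j++) {     /* a[i..i+WORDS-1] += mu * m */
            uint64_t t = (uint64_t)a[i + j] + (uint64_t)mu * m[j] + carry;
            a[i + j] = (uint32_t)t;
            carry = t >> 32;
        }
        uint64_t t = (uint64_t)a[i + WORDS] + carry + over;
        a[i + WORDS] = (uint32_t)t;
        over = (uint32_t)(t >> 32);
    }
    /* The value in over:a[WORDS..] is < 2*m.  Do one trial subtraction and
     * select the result with a mask so no secret-dependent branch occurs. */
    uint32_t d[WORDS];
    uint64_t borrow = 0;
    for (int i = 0; i < WORDS; i++) {
        uint64_t t = (uint64_t)a[WORDS + i] - m[i] - borrow;
        d[i] = (uint32_t)t;
        borrow = (t >> 32) & 1;
    }
    uint32_t use_d = over | (uint32_t)(1 - borrow);   /* 1 when value >= m */
    uint32_t mask = (uint32_t)0 - use_d;
    for (int i = 0; i < WORDS; i++)
        a[WORDS + i] = (d[i] & mask) | (a[WORDS + i] & ~mask);
}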
*/ -SP_NOINLINE static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, - sp_digit mp) +static void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) { - sp_digit ca = 0; + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; __asm__ __volatile__ ( - "mov r9, %[mp]\n\t" - "mov r12, %[m]\n\t" - "mov r10, %[a]\n\t" - "mov r4, #0\n\t" - "add r11, r10, #128\n\t" - "\n1:\n\t" + "LDR lr, [%[m]]\n\t" + /* i = 0 */ + "MOV r11, #0x0\n\t" + "MOV r3, #0x0\n\t" + "LDR r4, [%[a]]\n\t" + "LDR r5, [%[a], #4]\n\t" + "\n" + "L_sp_1024_mont_reduce_32_word_%=:\n\t" /* mu = a[i] * mp */ - "mov %[mp], r9\n\t" - "ldr %[a], [r10]\n\t" - "mul %[mp], %[mp], %[a]\n\t" - "mov %[m], r12\n\t" - "add r14, r10, #120\n\t" - "\n2:\n\t" - /* a[i+j] += m[j] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" - /* a[i+j+1] += m[j+1] * mu */ - "ldr %[a], [r10]\n\t" - "mov r4, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r4, r4, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r5, r5, %[a]\n\t" - "adc r4, r4, #0\n\t" - "str r5, [r10], #4\n\t" - "cmp r10, r14\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 2b\n\t" -#else - "blt.n 2b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ + "MUL r10, %[mp], r4\n\t" + /* a[i+0] += m[0] * mu */ + "MOV r7, #0x0\n\t" + "UMLAL r4, r7, r10, lr\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r9, [%[m], #4]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r5, r6, r10, r9\n\t" + "MOV r4, r5\n\t" + "ADDS r4, r4, r7\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r9, [%[m], #8]\n\t" + "LDR r5, [%[a], #8]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r5, r7, r10, r9\n\t" + "ADDS r5, r5, r6\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r9, [%[m], #12]\n\t" + "LDR r12, [%[a], #12]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #12]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r9, [%[m], #16]\n\t" + "LDR r12, [%[a], #16]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #16]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r9, [%[m], #20]\n\t" + "LDR r12, [%[a], #20]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #20]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r9, [%[m], #24]\n\t" + "LDR r12, [%[a], #24]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #24]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r9, [%[m], #28]\n\t" + "LDR r12, [%[a], #28]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #28]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r9, [%[m], #32]\n\t" + "LDR r12, [%[a], #32]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #32]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r9, [%[m], #36]\n\t" + "LDR r12, 
[%[a], #36]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #36]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r9, [%[m], #40]\n\t" + "LDR r12, [%[a], #40]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #40]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r9, [%[m], #44]\n\t" + "LDR r12, [%[a], #44]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #44]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r9, [%[m], #48]\n\t" + "LDR r12, [%[a], #48]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #48]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r9, [%[m], #52]\n\t" + "LDR r12, [%[a], #52]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #52]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r9, [%[m], #56]\n\t" + "LDR r12, [%[a], #56]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #56]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r9, [%[m], #60]\n\t" + "LDR r12, [%[a], #60]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #60]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r9, [%[m], #64]\n\t" + "LDR r12, [%[a], #64]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #64]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r9, [%[m], #68]\n\t" + "LDR r12, [%[a], #68]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #68]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r9, [%[m], #72]\n\t" + "LDR r12, [%[a], #72]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #72]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r9, [%[m], #76]\n\t" + "LDR r12, [%[a], #76]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #76]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r9, [%[m], #80]\n\t" + "LDR r12, [%[a], #80]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #80]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r9, [%[m], #84]\n\t" + "LDR r12, [%[a], #84]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #84]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r9, [%[m], #88]\n\t" + "LDR r12, [%[a], #88]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #88]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r9, [%[m], #92]\n\t" + "LDR r12, [%[a], #92]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #92]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r9, [%[m], #96]\n\t" + "LDR r12, [%[a], #96]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #96]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r9, [%[m], #100]\n\t" + "LDR r12, [%[a], #100]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL 
r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #100]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r9, [%[m], #104]\n\t" + "LDR r12, [%[a], #104]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #104]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r9, [%[m], #108]\n\t" + "LDR r12, [%[a], #108]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #108]\n\t" + "ADC r6, r6, #0x0\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r9, [%[m], #112]\n\t" + "LDR r12, [%[a], #112]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #112]\n\t" + "ADC r7, r7, #0x0\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r9, [%[m], #116]\n\t" + "LDR r12, [%[a], #116]\n\t" + "MOV r6, #0x0\n\t" + "UMLAL r12, r6, r10, r9\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #116]\n\t" + "ADC r6, r6, #0x0\n\t" /* a[i+30] += m[30] * mu */ - "ldr %[a], [r10]\n\t" - "mov r5, #0\n\t" - /* Multiply m[j] and mu - Start */ - "ldr r8, [%[m]], #4\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds %[a], %[a], r6\n\t" - "adc r5, r5, r8\n\t" - /* Multiply m[j] and mu - Done */ - "adds r4, r4, %[a]\n\t" - "adc r5, r5, #0\n\t" - "str r4, [r10], #4\n\t" + "LDR r9, [%[m], #120]\n\t" + "LDR r12, [%[a], #120]\n\t" + "MOV r7, #0x0\n\t" + "UMLAL r12, r7, r10, r9\n\t" + "ADDS r12, r12, r6\n\t" + "STR r12, [%[a], #120]\n\t" + "ADC r7, r7, #0x0\n\t" /* a[i+31] += m[31] * mu */ - "mov r4, %[ca]\n\t" - "mov %[ca], #0\n\t" - /* Multiply m[31] and mu - Start */ - "ldr r8, [%[m]]\n\t" - "umull r6, r8, %[mp], r8\n\t" - "adds r5, r5, r6\n\t" - "adcs r4, r4, r8\n\t" - "adc %[ca], %[ca], #0\n\t" - /* Multiply m[31] and mu - Done */ - "ldr r6, [r10]\n\t" - "ldr r8, [r10, #4]\n\t" - "adds r6, r6, r5\n\t" - "adcs r8, r8, r4\n\t" - "adc %[ca], %[ca], #0\n\t" - "str r6, [r10]\n\t" - "str r8, [r10, #4]\n\t" - /* Next word in a */ - "sub r10, r10, #120\n\t" - "cmp r10, r11\n\t" + "LDR r9, [%[m], #124]\n\t" + "LDR r12, [%[a], #124]\n\t" + "UMULL r8, r9, r10, r9\n\t" + "ADDS r7, r7, r8\n\t" + "ADCS r6, r9, r3\n\t" + "MOV r3, #0x0\n\t" + "ADC r3, r3, r3\n\t" + "ADDS r12, r12, r7\n\t" + "STR r12, [%[a], #124]\n\t" + "LDR r12, [%[a], #128]\n\t" + "ADCS r12, r12, r6\n\t" + "STR r12, [%[a], #128]\n\t" + "ADC r3, r3, #0x0\n\t" + /* i += 1 */ + "ADD r11, r11, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r11, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_1024_mont_reduce_32_word_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - "ldr r6, [%[m]]\n\t" - "subs r6, r6, r8\n\t" - "neg %[ca], %[ca]\n\t" - "sbc r6, r6, r6\n\t" - "orr %[ca], %[ca], r6\n\t" - "mov %[a], r10\n\t" - "mov %[m], r12\n\t" - : [ca] "+r" (ca), [a] "+r" (a) - : [m] "r" (m), [mp] "r" (mp) - : "memory", "r4", "r5", "r6", "r8", "r9", "r10", "r11", "r12", "r14" + "BLT.N L_sp_1024_mont_reduce_32_word_%=\n\t" +#endif + /* Loop Done */ + "STR r4, [%[a]]\n\t" + "STR r5, [%[a], #4]\n\t" + "LDR r8, [%[m], #124]\n\t" + "SUBS r12, r8, r12\n\t" + "neg r3, r3\n\t" + "SBC r12, r12, r12\n\t" + "ORR r3, r3, r12\n\t" + "MOV %[mp], r3\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" ); - - sp_1024_cond_sub_32(a - 32, a, m, ca); + sp_1024_cond_sub_32(a - 32, a, m, mp); } +#else +/* Reduce the number back to 1024 bits using Montgomery 
reduction. + * + * a A single precision number to reduce in place. + * m The single precision number representing the modulus. + * mp The digit representing the negative inverse of m mod 2^n. + */ +static void sp_1024_mont_reduce_32(sp_digit* a_p, const sp_digit* m_p, sp_digit mp_p) +{ + register sp_digit* a asm ("r0") = (sp_digit*)a_p; + register const sp_digit* m asm ("r1") = (const sp_digit*)m_p; + register sp_digit mp asm ("r2") = (sp_digit)mp_p; + + __asm__ __volatile__ ( + /* i = 0 */ + "MOV r4, #0x0\n\t" + "MOV r5, #0x0\n\t" + "LDR r6, [%[a]]\n\t" + "LDR r7, [%[a], #4]\n\t" + "LDR r8, [%[a], #8]\n\t" + "LDR r9, [%[a], #12]\n\t" + "LDR r10, [%[a], #16]\n\t" + "\n" + "L_sp_1024_mont_reduce_32_word_%=:\n\t" + /* mu = a[i] * mp */ + "MUL lr, %[mp], r6\n\t" + /* a[i+0] += m[0] * mu */ + "LDR r12, [%[m]]\n\t" + "MOV r3, #0x0\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+1] += m[1] * mu */ + "LDR r12, [%[m], #4]\n\t" + "MOV r6, r7\n\t" + "UMAAL r6, r3, lr, r12\n\t" + /* a[i+2] += m[2] * mu */ + "LDR r12, [%[m], #8]\n\t" + "MOV r7, r8\n\t" + "UMAAL r7, r3, lr, r12\n\t" + /* a[i+3] += m[3] * mu */ + "LDR r12, [%[m], #12]\n\t" + "MOV r8, r9\n\t" + "UMAAL r8, r3, lr, r12\n\t" + /* a[i+4] += m[4] * mu */ + "LDR r12, [%[m], #16]\n\t" + "MOV r9, r10\n\t" + "UMAAL r9, r3, lr, r12\n\t" + /* a[i+5] += m[5] * mu */ + "LDR r12, [%[m], #20]\n\t" + "LDR r10, [%[a], #20]\n\t" + "UMAAL r10, r3, lr, r12\n\t" + /* a[i+6] += m[6] * mu */ + "LDR r12, [%[m], #24]\n\t" + "LDR r11, [%[a], #24]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #24]\n\t" + /* a[i+7] += m[7] * mu */ + "LDR r12, [%[m], #28]\n\t" + "LDR r11, [%[a], #28]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #28]\n\t" + /* a[i+8] += m[8] * mu */ + "LDR r12, [%[m], #32]\n\t" + "LDR r11, [%[a], #32]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #32]\n\t" + /* a[i+9] += m[9] * mu */ + "LDR r12, [%[m], #36]\n\t" + "LDR r11, [%[a], #36]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #36]\n\t" + /* a[i+10] += m[10] * mu */ + "LDR r12, [%[m], #40]\n\t" + "LDR r11, [%[a], #40]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #40]\n\t" + /* a[i+11] += m[11] * mu */ + "LDR r12, [%[m], #44]\n\t" + "LDR r11, [%[a], #44]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #44]\n\t" + /* a[i+12] += m[12] * mu */ + "LDR r12, [%[m], #48]\n\t" + "LDR r11, [%[a], #48]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #48]\n\t" + /* a[i+13] += m[13] * mu */ + "LDR r12, [%[m], #52]\n\t" + "LDR r11, [%[a], #52]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #52]\n\t" + /* a[i+14] += m[14] * mu */ + "LDR r12, [%[m], #56]\n\t" + "LDR r11, [%[a], #56]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #56]\n\t" + /* a[i+15] += m[15] * mu */ + "LDR r12, [%[m], #60]\n\t" + "LDR r11, [%[a], #60]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #60]\n\t" + /* a[i+16] += m[16] * mu */ + "LDR r12, [%[m], #64]\n\t" + "LDR r11, [%[a], #64]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #64]\n\t" + /* a[i+17] += m[17] * mu */ + "LDR r12, [%[m], #68]\n\t" + "LDR r11, [%[a], #68]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #68]\n\t" + /* a[i+18] += m[18] * mu */ + "LDR r12, [%[m], #72]\n\t" + "LDR r11, [%[a], #72]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #72]\n\t" + /* a[i+19] += m[19] * mu */ + "LDR r12, [%[m], #76]\n\t" + "LDR r11, [%[a], #76]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #76]\n\t" + /* a[i+20] += m[20] * mu */ + "LDR r12, [%[m], #80]\n\t" + "LDR r11, [%[a], 
#80]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #80]\n\t" + /* a[i+21] += m[21] * mu */ + "LDR r12, [%[m], #84]\n\t" + "LDR r11, [%[a], #84]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #84]\n\t" + /* a[i+22] += m[22] * mu */ + "LDR r12, [%[m], #88]\n\t" + "LDR r11, [%[a], #88]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #88]\n\t" + /* a[i+23] += m[23] * mu */ + "LDR r12, [%[m], #92]\n\t" + "LDR r11, [%[a], #92]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #92]\n\t" + /* a[i+24] += m[24] * mu */ + "LDR r12, [%[m], #96]\n\t" + "LDR r11, [%[a], #96]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #96]\n\t" + /* a[i+25] += m[25] * mu */ + "LDR r12, [%[m], #100]\n\t" + "LDR r11, [%[a], #100]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #100]\n\t" + /* a[i+26] += m[26] * mu */ + "LDR r12, [%[m], #104]\n\t" + "LDR r11, [%[a], #104]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #104]\n\t" + /* a[i+27] += m[27] * mu */ + "LDR r12, [%[m], #108]\n\t" + "LDR r11, [%[a], #108]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #108]\n\t" + /* a[i+28] += m[28] * mu */ + "LDR r12, [%[m], #112]\n\t" + "LDR r11, [%[a], #112]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #112]\n\t" + /* a[i+29] += m[29] * mu */ + "LDR r12, [%[m], #116]\n\t" + "LDR r11, [%[a], #116]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #116]\n\t" + /* a[i+30] += m[30] * mu */ + "LDR r12, [%[m], #120]\n\t" + "LDR r11, [%[a], #120]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "STR r11, [%[a], #120]\n\t" + /* a[i+31] += m[31] * mu */ + "LDR r12, [%[m], #124]\n\t" + "LDR r11, [%[a], #124]\n\t" + "UMAAL r11, r3, lr, r12\n\t" + "LDR lr, [%[a], #128]\n\t" + "MOV r12, #0x0\n\t" + "UMAAL r3, lr, r12, r12\n\t" + "STR r11, [%[a], #124]\n\t" + "ADDS r3, r3, r5\n\t" + "ADC r5, lr, #0x0\n\t" + "STR r3, [%[a], #128]\n\t" + /* i += 1 */ + "ADD r4, r4, #0x4\n\t" + "ADD %[a], %[a], #0x4\n\t" + "CMP r4, #0x80\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_1024_mont_reduce_32_word_%=\n\t" +#else + "BLT.N L_sp_1024_mont_reduce_32_word_%=\n\t" +#endif + /* Loop Done */ + "STR r6, [%[a]]\n\t" + "STR r7, [%[a], #4]\n\t" + "STR r8, [%[a], #8]\n\t" + "STR r9, [%[a], #12]\n\t" + "STR r10, [%[a], #16]\n\t" + "LDR r12, [%[m], #124]\n\t" + "SUBS r11, r12, r11\n\t" + "neg r5, r5\n\t" + "SBC r11, r11, r11\n\t" + "ORR r5, r5, r11\n\t" + "MOV %[mp], r5\n\t" + : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp) + : + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr" + ); + sp_1024_cond_sub_32(a - 32, a, m, mp); +} + +#endif /* Multiply two Montgomery form numbers mod the modulus (prime). * (r = a * b mod m) * @@ -43333,166 +67617,169 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p, * b Second number to add in Montgomery form. * m Modulus (prime). 
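The modular addition below follows the usual constant-time pattern: propagate the carry through all 32 words, then subtract the modulus under an all-ones/all-zero mask instead of branching. A minimal portable sketch with illustrative names, assuming 32-bit digits; the generated assembly derives its mask from the carry flag and the top modulus word only, which is enough to keep intermediates bounded for this curve, whereas the sketch does a full trial subtraction for clarity.

#include <stdint.h>

#define WORDS 32  /* 1024-bit values, 32-bit digits (assumption) */

/* r = a + b (mod m), constant time: the subtraction of m is performed
 * unconditionally, but the result is kept only when the sum carried out
 * or reached at least m, selected by a mask rather than a branch. */
static void mont_add_ref(uint32_t* r, const uint32_t* a, const uint32_t* b,
                         const uint32_t* m)
{
    uint64_t carry = 0;
    for (int i = 0; i < WORDS; i++) {        /* r = a + b with carry */
        uint64_t t = (uint64_t)a[i] + b[i] + carry;
        r[i] = (uint32_t)t;
        carry = t >> 32;
    }
    uint32_t d[WORDS];                       /* trial subtraction r - m */
    uint64_t borrow = 0;
    for (int i = 0; i < WORDS; i++) {
        uint64_t t = (uint64_t)r[i] - m[i] - borrow;
        d[i] = (uint32_t)t;
        borrow = (t >> 32) & 1;
    }
    /* Keep r - m when the sum carried out or r >= m; otherwise keep r. */
    uint32_t mask = (uint32_t)0 - ((uint32_t)carry | (uint32_t)(1 - borrow));
    for (int i = 0; i < WORDS; i++)
        r[i] = (d[i] & mask) | (r[i] & ~mask);
}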
*/ -SP_NOINLINE static void sp_1024_mont_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +static void sp_1024_mont_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register const sp_digit* m asm ("r3") = (const sp_digit*)m_p; + __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adds r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldr r14, [%[m], #124]\n\t" - "adc r12, r12, #0\n\t" - "subs r14, r14, r7\n\t" - "neg r12, r12\n\t" - "sbc r14, r14, r14\n\t" - "sub %[r], %[r], #128\n\t" - "orr r12, r14\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "subs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, 
r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbc r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "sub %[r], %[r], #128\n\t" + "MOV r12, #0x0\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDR r11, [%[m], #124]\n\t" + "ADC r12, r12, #0x0\n\t" + "SUBS r11, r11, r7\n\t" + "neg r12, r12\n\t" + "SBC r11, r11, r11\n\t" + "SUB %[r], %[r], #0x80\n\t" + "ORR r12, r12, r11\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SUBS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + 
"AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBC r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } @@ -43502,149 +67789,152 @@ SP_NOINLINE static void sp_1024_mont_add_32(sp_digit* r, const sp_digit* a, cons * a Number to double in Montgomery form. * m Modulus (prime). 
*/ -SP_NOINLINE static void sp_1024_mont_dbl_32(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_1024_mont_dbl_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; + __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldm %[a]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "adds r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adcs r14, r14, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adcs r14, r14, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adcs r14, r14, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adcs r14, r14, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "ldr r4, [%[m], #124]\n\t" - "adc r12, r12, #0\n\t" - "subs r4, r4, r14\n\t" - "neg r12, r12\n\t" - "sbc r4, r4, r4\n\t" - "sub %[r], %[r], #128\n\t" - "orr r12, r4\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "subs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs 
r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbc r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "sub %[r], %[r], #128\n\t" + "MOV r12, #0x0\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDR r4, [%[m], #124]\n\t" + "ADC r12, r12, #0x0\n\t" + "SUBS r4, r4, r11\n\t" + "neg r12, r12\n\t" + "SBC r4, r4, r4\n\t" + "SUB %[r], %[r], #0x80\n\t" + "ORR r12, r12, r4\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SUBS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND 
r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBC r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" + : "memory", "r8", "r9", "r10", "r11", "r4", "r5", "r6", "r7", "r12" ); } @@ -43654,304 +67944,307 @@ SP_NOINLINE static void sp_1024_mont_dbl_32(sp_digit* r, const sp_digit* a, cons * a Number to triple in Montgomery form. * m Modulus (prime). */ -SP_NOINLINE static void sp_1024_mont_tpl_32(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_1024_mont_tpl_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* m asm ("r2") = (const sp_digit*)m_p; + __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldm %[a]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "adds r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adcs r14, r14, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adcs r14, r14, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adcs r14, r14, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "adcs r4, r4, r4\n\t" - "adcs r5, r5, r5\n\t" - "adcs r6, r6, r6\n\t" - "adcs r7, r7, r7\n\t" - "adcs r8, r8, r8\n\t" - "adcs r9, r9, r9\n\t" - "adcs r10, r10, r10\n\t" - "adcs r14, r14, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7, r8, r9, r10, r14}\n\t" - "ldr r4, [%[m], #124]\n\t" - "adc r12, r12, #0\n\t" - "subs r4, r4, r14\n\t" - "neg r12, r12\n\t" - "sbc r4, r4, r4\n\t" - "sub %[r], %[r], #128\n\t" - "orr r12, r4\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "subs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - 
"ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbc r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "sub %[r], %[r], #128\n\t" - "sub %[m], %[m], #128\n\t" - "sub %[a], %[a], #128\n\t" - "mov r12, #0\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "adds r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" 
- "ldm %[r], {r8, r9, r10, r14}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "adcs r8, r8, r4\n\t" - "adcs r9, r9, r5\n\t" - "adcs r10, r10, r6\n\t" - "adcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldr r7, [%[m], #124]\n\t" - "adc r12, r12, #0\n\t" - "subs r7, r7, r14\n\t" - "neg r12, r12\n\t" - "sbc r7, r7, r7\n\t" - "sub %[r], %[r], #128\n\t" - "orr r12, r7\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[m]!, {r4, r5, r6, r7}\n\t" - "and r4, r4, r12\n\t" - "and r5, r5, r12\n\t" - "and r6, r6, r12\n\t" - "and r7, r7, r12\n\t" - "subs r8, r8, r4\n\t" - "sbcs r9, r9, r5\n\t" - "sbcs r10, r10, r6\n\t" - "sbcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[m]!, {r4, r5, r6, r7}\n\t" - "and r4, r4, r12\n\t" - "and r5, r5, r12\n\t" - "and r6, r6, r12\n\t" - "and r7, r7, r12\n\t" - "sbcs r8, r8, r4\n\t" - "sbcs r9, r9, r5\n\t" - "sbcs r10, r10, r6\n\t" - "sbcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[m]!, {r4, r5, r6, r7}\n\t" - "and r4, r4, r12\n\t" - "and r5, r5, r12\n\t" - "and r6, r6, r12\n\t" - "and r7, r7, r12\n\t" - "sbcs r8, r8, r4\n\t" - "sbcs r9, r9, r5\n\t" - "sbcs r10, r10, r6\n\t" - "sbcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[m]!, {r4, r5, r6, r7}\n\t" - "and r4, r4, r12\n\t" - "and r5, r5, r12\n\t" - "and r6, r6, r12\n\t" - "and r7, r7, r12\n\t" - "sbcs r8, r8, r4\n\t" - "sbcs r9, r9, r5\n\t" - "sbcs r10, r10, r6\n\t" - "sbcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[m]!, {r4, r5, r6, r7}\n\t" - "and r4, r4, r12\n\t" - "and r5, r5, r12\n\t" - "and r6, r6, r12\n\t" - "and r7, r7, r12\n\t" - "sbcs r8, r8, r4\n\t" - "sbcs r9, r9, r5\n\t" - "sbcs r10, r10, r6\n\t" - "sbcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[m]!, {r4, r5, r6, r7}\n\t" - "and r4, r4, r12\n\t" - "and r5, r5, r12\n\t" - "and r6, r6, r12\n\t" - "and r7, r7, r12\n\t" - "sbcs r8, r8, r4\n\t" - "sbcs r9, r9, r5\n\t" - "sbcs r10, r10, r6\n\t" - "sbcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[m]!, {r4, r5, r6, r7}\n\t" - "and r4, r4, r12\n\t" - "and r5, r5, r12\n\t" - "and r6, r6, r12\n\t" - "and r7, r7, r12\n\t" - "sbcs r8, r8, r4\n\t" - "sbcs r9, r9, r5\n\t" - "sbcs r10, r10, r6\n\t" - "sbcs r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "ldm %[r], {r8, r9, r10, r14}\n\t" - "ldm %[m]!, {r4, r5, r6, r7}\n\t" - "and r4, r4, r12\n\t" - "and r5, r5, r12\n\t" - "and r6, r6, r12\n\t" - "and r7, r7, r12\n\t" - "sbcs r8, r8, r4\n\t" - "sbcs r9, r9, r5\n\t" - "sbcs r10, r10, r6\n\t" - "sbc r14, r14, r7\n\t" - "stm %[r]!, {r8, r9, r10, r14}\n\t" - "sub %[r], %[r], #128\n\t" + "MOV r12, #0x0\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADDS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS 
r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "ADCS r4, r4, r4\n\t" + "ADCS r5, r5, r5\n\t" + "ADCS r6, r6, r6\n\t" + "ADCS r7, r7, r7\n\t" + "ADCS r8, r8, r8\n\t" + "ADCS r9, r9, r9\n\t" + "ADCS r10, r10, r10\n\t" + "ADCS r11, r11, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" + "LDR r4, [%[m], #124]\n\t" + "ADC r12, r12, #0x0\n\t" + "SUBS r4, r4, r11\n\t" + "neg r12, r12\n\t" + "SBC r4, r4, r4\n\t" + "SUB %[r], %[r], #0x80\n\t" + "ORR r12, r12, r4\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SUBS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBC r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "SUB %[r], %[r], #0x80\n\t" + "SUB %[m], %[m], #0x80\n\t" + "SUB %[a], %[a], #0x80\n\t" + "MOV r12, #0x0\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r8, r9, r10, 
r11}\n\t" + "ADDS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r8, r9, r10, r11}\n\t" + "ADCS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r8, r9, r10, r11}\n\t" + "ADCS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r8, r9, r10, r11}\n\t" + "ADCS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r8, r9, r10, r11}\n\t" + "ADCS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r8, r9, r10, r11}\n\t" + "ADCS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r8, r9, r10, r11}\n\t" + "ADCS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r8, r9, r10, r11}\n\t" + "ADCS r8, r8, r4\n\t" + "ADCS r9, r9, r5\n\t" + "ADCS r10, r10, r6\n\t" + "ADCS r11, r11, r7\n\t" + "STM %[r]!, {r8, r9, r10, r11}\n\t" + "LDR r7, [%[m], #124]\n\t" + "ADC r12, r12, #0x0\n\t" + "SUBS r7, r7, r11\n\t" + "neg r12, r12\n\t" + "SBC r7, r7, r7\n\t" + "SUB %[r], %[r], #0x80\n\t" + "ORR r12, r12, r7\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SUBS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, 
r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBC r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" : [r] "+r" (r), [a] "+r" (a), [m] "+r" (m) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" + : "memory", "r8", "r9", "r10", "r11", "r4", "r5", "r6", "r7", "r12" ); } @@ -43962,165 +68255,167 @@ SP_NOINLINE static void sp_1024_mont_tpl_32(sp_digit* r, const sp_digit* a, cons * b Number to subtract with in Montgomery form. * m Modulus (prime). */ -SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b, - const sp_digit* m) +static void sp_1024_mont_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, const sp_digit* m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register const sp_digit* m asm ("r3") = (const sp_digit*)m_p; + __asm__ __volatile__ ( - "mov r12, #0\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "subs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[a]!, {r4, r5, r6, r7}\n\t" - "ldm %[b]!, {r8, r9, r10, r14}\n\t" - "sbcs r4, r4, r8\n\t" - "sbcs r5, r5, r9\n\t" - "sbcs r6, r6, r10\n\t" - "sbcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "sbc r12, r12, r12\n\t" - "sub %[r], %[r], #128\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "adds r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, 
r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adcs r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "ldm %[r], {r4, r5, r6, r7}\n\t" - "ldm %[m]!, {r8, r9, r10, r14}\n\t" - "and r8, r8, r12\n\t" - "and r9, r9, r12\n\t" - "and r10, r10, r12\n\t" - "and r14, r14, r12\n\t" - "adcs r4, r4, r8\n\t" - "adcs r5, r5, r9\n\t" - "adcs r6, r6, r10\n\t" - "adc r7, r7, r14\n\t" - "stm %[r]!, {r4, r5, r6, r7}\n\t" - "sub %[r], %[r], #128\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "SUBS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, 
r10, r11}\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[a]!, {r4, r5, r6, r7}\n\t" + "LDM %[b]!, {r8, r9, r10, r11}\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "SBCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "SBC r12, r12, r12\n\t" + "SUB %[r], %[r], #0x80\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "ADDS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADCS r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" + "LDM %[r], {r4, r5, r6, r7}\n\t" + "LDM %[m]!, {r8, r9, r10, r11}\n\t" + "AND r8, r8, r12\n\t" + "AND r9, r9, r12\n\t" + "AND r10, r10, r12\n\t" + "AND r11, r11, r12\n\t" + "ADCS r4, r4, r8\n\t" + "ADCS r5, r5, r9\n\t" + "ADCS r6, r6, r10\n\t" + "ADC r7, r7, r11\n\t" + "STM %[r]!, {r4, r5, r6, r7}\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) : - : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r14", "r12" + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } -#define sp_1024_mont_sub_lower_32 sp_1024_mont_sub_32 +#ifdef WOLFSSL_SP_SMALL /* Conditionally add a and b using the mask m. * m is -1 to add and 0 when not. * @@ -44129,171 +68424,314 @@ SP_NOINLINE static void sp_1024_mont_sub_32(sp_digit* r, const sp_digit* a, cons * b A single precision number to add. * m Mask value to apply. 
*/ -SP_NOINLINE static sp_digit sp_1024_cond_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, - sp_digit m) +static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; __asm__ __volatile__ ( - "mov r5, #128\n\t" - "mov r9, r5\n\t" - "mov r8, #0\n\t" - "\n1:\n\t" - "ldr r6, [%[b], r8]\n\t" - "and r6, r6, %[m]\n\t" - "adds r5, %[c], #-1\n\t" - "ldr r5, [%[a], r8]\n\t" - "adcs r5, r5, r6\n\t" - "mov %[c], #0\n\t" - "adcs %[c], %[c], %[c]\n\t" - "str r5, [%[r], r8]\n\t" - "add r8, r8, #4\n\t" - "cmp r8, r9\n\t" + "MOV r5, #0x0\n\t" + "MOV r8, #0x0\n\t" + "MOV r4, #0x0\n\t" + "\n" + "L_sp_1024_cond_add_32_words_%=:\n\t" + "ADDS r5, r5, #0x-1\n\t" + "LDR r6, [%[a], r4]\n\t" + "LDR r7, [%[b], r4]\n\t" + "AND r7, r7, %[m]\n\t" + "ADCS r6, r6, r7\n\t" + "ADC r5, r8, r8\n\t" + "STR r6, [%[r], r4]\n\t" + "ADD r4, r4, #0x4\n\t" + "CMP r4, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "blt 1b\n\t" + "BLT L_sp_1024_cond_add_32_words_%=\n\t" #else - "blt.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c) - : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m) - : "memory", "r5", "r6", "r8", "r9" + "BLT.N L_sp_1024_cond_add_32_words_%=\n\t" +#endif + "MOV %[r], r5\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8" ); - - return c; + return (uint32_t)(size_t)r; } -static void sp_1024_rshift1_32(sp_digit* r, const sp_digit* a) +#else +/* Conditionally add a and b using the mask m. + * m is -1 to add and 0 when not. + * + * r A single precision number representing conditional add result. + * a A single precision number to add with. + * b A single precision number to add. + * m Mask value to apply. 
+ */ +static sp_digit sp_1024_cond_add_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p, sp_digit m_p) { + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; + register sp_digit m asm ("r3") = (sp_digit)m_p; + __asm__ __volatile__ ( - "ldr r2, [%[a]]\n\t" - "ldr r3, [%[a], #4]\n\t" - "lsr r2, r2, #1\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #8]\n\t" - "str r2, [%[r], #0]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #12]\n\t" - "str r3, [%[r], #4]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #16]\n\t" - "str r4, [%[r], #8]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #20]\n\t" - "str r2, [%[r], #12]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #24]\n\t" - "str r3, [%[r], #16]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #28]\n\t" - "str r4, [%[r], #20]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #32]\n\t" - "str r2, [%[r], #24]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #36]\n\t" - "str r3, [%[r], #28]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #40]\n\t" - "str r4, [%[r], #32]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #44]\n\t" - "str r2, [%[r], #36]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #48]\n\t" - "str r3, [%[r], #40]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #52]\n\t" - "str r4, [%[r], #44]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #56]\n\t" - "str r2, [%[r], #48]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #60]\n\t" - "str r3, [%[r], #52]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #64]\n\t" - "str r4, [%[r], #56]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #68]\n\t" - "str r2, [%[r], #60]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #72]\n\t" - "str r3, [%[r], #64]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #76]\n\t" - "str r4, [%[r], #68]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #80]\n\t" - "str r2, [%[r], #72]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #84]\n\t" - "str r3, [%[r], #76]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #88]\n\t" - "str r4, [%[r], #80]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #92]\n\t" - "str r2, [%[r], #84]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #96]\n\t" - "str r3, [%[r], #88]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #100]\n\t" - "str r4, [%[r], #92]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #104]\n\t" - "str r2, [%[r], #96]\n\t" - "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #108]\n\t" - "str r3, [%[r], #100]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #112]\n\t" - "str r4, [%[r], #104]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "ldr r4, [%[a], #116]\n\t" - "str r2, [%[r], #108]\n\t" 
- "orr r3, r3, r4, lsl #31\n\t" - "lsr r4, r4, #1\n\t" - "ldr r2, [%[a], #120]\n\t" - "str r3, [%[r], #112]\n\t" - "orr r4, r4, r2, lsl #31\n\t" - "lsr r2, r2, #1\n\t" - "ldr r3, [%[a], #124]\n\t" - "str r4, [%[r], #116]\n\t" - "orr r2, r2, r3, lsl #31\n\t" - "lsr r3, r3, #1\n\t" - "str r2, [%[r], #120]\n\t" - "str r3, [%[r], #124]\n\t" + "MOV r10, #0x0\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADDS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "LDM %[a]!, {r6, r7}\n\t" + "LDM %[b]!, {r8, r9}\n\t" + "AND r8, r8, %[m]\n\t" + "AND r9, r9, %[m]\n\t" + "ADCS r6, r6, r8\n\t" + "ADCS r7, r7, r9\n\t" + "STM %[r]!, {r6, r7}\n\t" + "ADC %[r], r10, r10\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10" + ); + return (uint32_t)(size_t)r; +} + +#endif /* WOLFSSL_SP_SMALL */ +static void sp_1024_rshift1_32(sp_digit* r_p, const sp_digit* 
a_p) +{ + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + + __asm__ __volatile__ ( + "LDM %[a], {r2, r3}\n\t" + "LSR r2, r2, #1\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #8]\n\t" + "STR r2, [%[r]]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #12]\n\t" + "STR r3, [%[r], #4]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #16]\n\t" + "STR r4, [%[r], #8]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #20]\n\t" + "STR r2, [%[r], #12]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #24]\n\t" + "STR r3, [%[r], #16]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #28]\n\t" + "STR r4, [%[r], #20]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #32]\n\t" + "STR r2, [%[r], #24]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #36]\n\t" + "STR r3, [%[r], #28]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #40]\n\t" + "STR r4, [%[r], #32]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #44]\n\t" + "STR r2, [%[r], #36]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #48]\n\t" + "STR r3, [%[r], #40]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #52]\n\t" + "STR r4, [%[r], #44]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #56]\n\t" + "STR r2, [%[r], #48]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #60]\n\t" + "STR r3, [%[r], #52]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #64]\n\t" + "STR r4, [%[r], #56]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #68]\n\t" + "STR r2, [%[r], #60]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #72]\n\t" + "STR r3, [%[r], #64]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #76]\n\t" + "STR r4, [%[r], #68]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #80]\n\t" + "STR r2, [%[r], #72]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #84]\n\t" + "STR r3, [%[r], #76]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #88]\n\t" + "STR r4, [%[r], #80]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #92]\n\t" + "STR r2, [%[r], #84]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #96]\n\t" + "STR r3, [%[r], #88]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #100]\n\t" + "STR r4, [%[r], #92]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #104]\n\t" + "STR r2, [%[r], #96]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #108]\n\t" + "STR r3, [%[r], #100]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #112]\n\t" + "STR r4, [%[r], #104]\n\t" + "ORR r2, r2, r3, lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "LDR r4, [%[a], #116]\n\t" + "STR r2, [%[r], #108]\n\t" + "ORR r3, r3, r4, lsl #31\n\t" + "LSR r4, r4, #1\n\t" + "LDR r2, [%[a], #120]\n\t" + "STR r3, [%[r], #112]\n\t" + "ORR r4, r4, r2, lsl #31\n\t" + "LSR r2, r2, #1\n\t" + "LDR r3, [%[a], #124]\n\t" + "STR r4, [%[r], #116]\n\t" + "ORR r2, r2, r3, 
lsl #31\n\t" + "LSR r3, r3, #1\n\t" + "STR r2, [%[r], #120]\n\t" + "STR r3, [%[r], #124]\n\t" + : [r] "+r" (r), [a] "+r" (a) : - : [r] "r" (r), [a] "r" (a) : "memory", "r2", "r3", "r4" ); } @@ -44304,7 +68742,7 @@ static void sp_1024_rshift1_32(sp_digit* r, const sp_digit* a) * a Number to divide. * m Modulus (prime). */ -SP_NOINLINE static void sp_1024_div2_32(sp_digit* r, const sp_digit* a, const sp_digit* m) +static void sp_1024_div2_32(sp_digit* r, const sp_digit* a, const sp_digit* m) { sp_digit o; @@ -44367,7 +68805,7 @@ static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_32(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); + sp_1024_mont_sub_32(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -44489,7 +68927,7 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -44521,37 +68959,38 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, * a A single precision integer. * b A single precision integer. */ -SP_NOINLINE static sp_digit sp_1024_sub_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_1024_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "mov r6, %[a]\n\t" - "add r6, r6, #128\n\t" - "\n1:\n\t" - "mov r5, #0\n\t" - "subs r5, r5, %[c]\n\t" - "ldr r4, [%[a]]\n\t" - "ldr r5, [%[b]]\n\t" - "sbcs r4, r4, r5\n\t" - "str r4, [%[r]]\n\t" - "sbc %[c], %[c], %[c]\n\t" - "add %[a], %[a], #4\n\t" - "add %[b], %[b], #4\n\t" - "add %[r], %[r], #4\n\t" - "cmp %[a], r6\n\t" + "MOV r11, #0x0\n\t" + "ADD r12, %[a], #0x80\n\t" + "\n" + "L_sp_1024_sub_32_word_%=:\n\t" + "RSBS r11, r11, #0x0\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC r11, r3, r3\n\t" + "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "bne 1b\n\t" + "BNE L_sp_1024_sub_32_word_%=\n\t" #else - "bne.n 1b\n\t" -#endif /* __GNUC__ || __ICCARM__ || __IAR_SYSTEMS_ICC__ */ - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "BNE.N L_sp_1024_sub_32_word_%=\n\t" +#endif + "MOV %[r], r11\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); - - return c; + return (uint32_t)(size_t)r; } #else @@ -44561,99 +69000,75 @@ SP_NOINLINE static sp_digit sp_1024_sub_32(sp_digit* r, const sp_digit* a, * a A single precision integer. * b A single precision integer. 
*/ -SP_NOINLINE static sp_digit sp_1024_sub_32(sp_digit* r, const sp_digit* a, - const sp_digit* b) +static sp_digit sp_1024_sub_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p) { - sp_digit c = 0; + register sp_digit* r asm ("r0") = (sp_digit*)r_p; + register const sp_digit* a asm ("r1") = (const sp_digit*)a_p; + register const sp_digit* b asm ("r2") = (const sp_digit*)b_p; __asm__ __volatile__ ( - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "subs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "ldm %[a]!, {r4, r5}\n\t" - "ldm %[b]!, {r6, r8}\n\t" - "sbcs r4, r4, r6\n\t" - "sbcs r5, r5, r8\n\t" - "stm %[r]!, {r4, r5}\n\t" - "sbc %[c], %[c], %[c]\n\t" - : [c] "+r" (c), [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SUBS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, 
r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "LDM %[a]!, {r3, r4, r5, r6}\n\t" + "LDM %[b]!, {r7, r8, r9, r10}\n\t" + "SBCS r3, r3, r7\n\t" + "SBCS r4, r4, r8\n\t" + "SBCS r5, r5, r9\n\t" + "SBCS r6, r6, r10\n\t" + "STM %[r]!, {r3, r4, r5, r6}\n\t" + "SBC %[r], r6, r6\n\t" + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : - : "memory", "r4", "r5", "r6", "r8" + : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10" ); - - return c; + return (uint32_t)(size_t)r; } #endif /* WOLFSSL_SP_SMALL */ @@ -44704,12 +69119,12 @@ static int sp_1024_iszero_32(const sp_digit* a) static void sp_1024_proj_point_add_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*32; - sp_digit* t3 = t + 4*32; - sp_digit* t4 = t + 6*32; - sp_digit* t5 = t + 8*32; - sp_digit* t6 = t + 10*32; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*32; + sp_digit* t2 = t + 4*32; + sp_digit* t3 = t + 6*32; + sp_digit* t4 = t + 8*32; + sp_digit* t5 = t + 10*32; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod); @@ -44731,17 +69146,9 @@ static void sp_1024_proj_point_add_32(sp_point_1024* r, sp_1024_proj_point_dbl_32(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_1024_mont_sub_32(t2, t2, t1, p1024_mod); @@ -44760,20 +69167,31 @@ static void sp_1024_proj_point_add_32(sp_point_1024* r, sp_1024_mont_dbl_32(t3, y, p1024_mod); sp_1024_mont_sub_32(x, x, t3, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_32(y, y, x, p1024_mod); + sp_1024_mont_sub_32(y, y, x, p1024_mod); sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(y, y, t5, p1024_mod); - for (i = 0; i < 32; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 32; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 32; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -44819,12 +69237,12 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*32; - ctx->t3 = t + 4*32; - ctx->t4 = t + 6*32; - 
ctx->t5 = t + 8*32; - ctx->t6 = t + 10*32; + ctx->t6 = t; + ctx->t1 = t + 2*32; + ctx->t2 = t + 4*32; + ctx->t3 = t + 6*32; + ctx->t4 = t + 8*32; + ctx->t5 = t + 10*32; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -44931,7 +69349,7 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 22; break; case 22: @@ -44944,22 +69362,28 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 32; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 32; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 32; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -45119,8 +69543,6 @@ static int sp_1024_ecc_mulmod_fast_32(sp_point_1024* r, const sp_point_1024* g, } #if defined(FP_ECC) || !defined(WOLFSSL_SP_SMALL) -#define sp_1024_mont_dbl_lower_32 sp_1024_mont_dbl_32 -#define sp_1024_mont_tpl_lower_32 sp_1024_mont_tpl_32 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
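A minimal standalone C sketch of the branch-free three-way select used in the reworked infinity handling above, assuming 32-bit sp_digit words (the helper name and signature here are illustrative only, not part of the library):

#include <stdint.h>

typedef uint32_t sp_digit;

/* Pick each output word from p, q or the computed result t without
 * branching on the infinity flags:
 *   maskp is all-ones only when q is the point at infinity and p is not,
 *   maskq is all-ones only when p is the point at infinity and q is not,
 *   maskt covers the remaining case (both points finite). */
static void select_words(sp_digit* r, const sp_digit* p, const sp_digit* q,
                         const sp_digit* t, int p_inf, int q_inf, int n)
{
    sp_digit maskp = (sp_digit)0 - (sp_digit)(q_inf & (!p_inf));
    sp_digit maskq = (sp_digit)0 - (sp_digit)(p_inf & (!q_inf));
    sp_digit maskt = ~(maskp | maskq);
    int i;

    for (i = 0; i < n; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}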
@@ -45159,7 +69581,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); + sp_1024_mont_tpl_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -45168,8 +69590,8 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); + sp_1024_mont_sub_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -45189,7 +69611,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_32(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_32(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_32(a, t1, p1024_mod); + sp_1024_mont_tpl_32(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_32(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_32(b, t1, x, p1024_mod, p1024_mp_mod); @@ -45198,8 +69620,8 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i, sp_1024_mont_dbl_32(t2, b, p1024_mod); sp_1024_mont_sub_32(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_32(b, t2, p1024_mod); + sp_1024_mont_sub_32(t2, b, x, p1024_mod); + sp_1024_mont_dbl_32(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_32(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -45255,12 +69677,12 @@ typedef struct sp_table_entry_1024 { static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*32; - sp_digit* t3 = t + 4*32; - sp_digit* t4 = t + 6*32; - sp_digit* t5 = t + 8*32; - sp_digit* t6 = t + 10*32; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*32; + sp_digit* t6 = t + 4*32; + sp_digit* t1 = t + 6*32; + sp_digit* t4 = t + 8*32; + sp_digit* t5 = t + 10*32; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -45276,13 +69698,9 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, sp_1024_proj_point_dbl_32(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_1024_mont_sub_32(t2, t2, p->x, p1024_mod); @@ -45291,33 +69709,40 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, /* Z3 = H*Z1 */ sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_1024_mont_sqr_32(t1, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t3, p->x, t5, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_32(x, t1, t5, p1024_mod); - sp_1024_mont_dbl_32(t1, t3, p1024_mod); - sp_1024_mont_sub_32(x, x, t1, p1024_mod); + sp_1024_mont_sqr_32(t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t3, p->x, t1, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_32(t1, t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_32(t2, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_32(t2, t2, t1, p1024_mod); + sp_1024_mont_dbl_32(t5, t3, p1024_mod); + sp_1024_mont_sub_32(x, t2, t5, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_lower_32(t3, t3, x, p1024_mod); + sp_1024_mont_sub_32(t3, t3, x, p1024_mod); sp_1024_mont_mul_32(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_32(t5, t5, p->y, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_32(y, t3, t5, p1024_mod); + sp_1024_mont_mul_32(t1, t1, p->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_32(y, t3, t1, p1024_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 32; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 32; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 32; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 32; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 32; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -49751,7 +74176,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, + point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_int.c b/wolfcrypt/src/sp_int.c index d37927523..022df827c 100644 --- a/wolfcrypt/src/sp_int.c +++ b/wolfcrypt/src/sp_int.c @@ -1245,7 +1245,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \ : "cc" \ ) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH >= 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH >= 7) /* Count leading zeros - 
instruction only available on ARMv7 and newer. */ #define SP_ASM_LZCNT(va, vn) \ __asm__ __volatile__ ( \ @@ -1272,7 +1272,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, sp_int_digit d) { sp_int_digit r = 0; -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) static const char debruijn32[32] = { 0, 31, 9, 30, 3, 8, 13, 29, 2, 5, 7, 21, 12, 24, 28, 19, 1, 10, 4, 14, 6, 22, 25, 20, 11, 15, 23, 26, 16, 27, 17, 18 @@ -1282,7 +1282,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, __asm__ __volatile__ ( /* Shift d so that top bit is set. */ -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, %[m]\n\t" "mov r5, %[d]\n\t" "orr r5, r5, r5, lsr #1\n\t" @@ -1291,8 +1291,8 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, "orr r5, r5, r5, lsr #8\n\t" "orr r5, r5, r5, lsr #16\n\t" "add r5, r5, #1\n\t" - "mul r5, r5, r4\n\t" - "lsr r5, r5, #27\n\t" + "mul r6, r5, r4\n\t" + "lsr r5, r6, #27\n\t" "ldrb r5, [%[t], r5]\n\t" #else "clz r5, %[d]\n\t" @@ -1352,7 +1352,7 @@ static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo, "sbc r8, r8, r8\n\t" "sub %[r], %[r], r8\n\t" : [r] "+r" (r), [hi] "+r" (hi), [lo] "+r" (lo), [d] "+r" (d) -#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7) +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) : [t] "r" (debruijn32), [m] "m" (debruijn32_mul) #else : diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index 4b63aa7fc..c53dab1ff 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -52,6 +52,15 @@ #include +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif + #ifdef WOLFSSL_SP_X86_64_ASM #define SP_PRINT_NUM(var, name, total, words, bits) \ do { \ @@ -116,14 +125,14 @@ static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -3079,14 +3088,14 @@ static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -5977,14 +5986,14 @@ static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - 
(((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -8184,14 +8193,14 @@ static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -8591,14 +8600,14 @@ extern void sp_256_mont_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, #ifdef __cplusplus extern "C" { #endif -extern void sp_256_mont_sub_lower_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); +extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m); #ifdef __cplusplus } #endif #ifdef __cplusplus extern "C" { #endif -extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m); +extern void sp_256_mont_sub_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); #ifdef __cplusplus } #endif @@ -8651,12 +8660,10 @@ static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p, sp_256_mont_mul_4(y, y, p->x, p256_mod, p256_mp_mod); /* X = T1 * T1 */ sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod); - /* X = X - Y */ - sp_256_mont_sub_4(x, x, y, p256_mod); - /* X = X - Y */ - sp_256_mont_sub_4(x, x, y, p256_mod); + /* X = X - 2*Y */ + sp_256_mont_sub_dbl_4(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_lower_4(y, y, x, p256_mod); + sp_256_mont_sub_4(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -8767,18 +8774,16 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con ctx->state = 14; break; case 14: - /* X = X - Y */ - sp_256_mont_sub_4(ctx->x, ctx->x, ctx->y, p256_mod); + /* X = X - 2*Y */ + sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod); ctx->state = 15; break; case 15: - /* X = X - Y */ - sp_256_mont_sub_4(ctx->x, ctx->x, ctx->y, p256_mod); ctx->state = 16; break; case 16: /* Y = Y - X */ - sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -8806,20 +8811,6 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con #ifdef __cplusplus extern "C" { #endif -extern void sp_256_mont_tpl_lower_4(sp_digit* r, const sp_digit* a, const sp_digit* m); -#ifdef __cplusplus -} -#endif -#ifdef __cplusplus -extern "C" { -#endif -extern void sp_256_mont_sub_dbl_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); -#ifdef __cplusplus -} -#endif -#ifdef __cplusplus -extern "C" { -#endif extern void sp_256_mont_dbl_sub_4(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); #ifdef __cplusplus } @@ -8861,7 +8852,7 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_4(a, t1, p256_mod); + sp_256_mont_tpl_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, 
p256_mp_mod); @@ -8889,7 +8880,7 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_4(a, t1, p256_mod); + sp_256_mont_tpl_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); @@ -8945,12 +8936,12 @@ static int sp_256_iszero_4(const sp_digit* a) static void sp_256_proj_point_add_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*4; - sp_digit* t3 = t + 4*4; - sp_digit* t4 = t + 6*4; - sp_digit* t5 = t + 8*4; - sp_digit* t6 = t + 10*4; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*4; + sp_digit* t2 = t + 4*4; + sp_digit* t3 = t + 6*4; + sp_digit* t4 = t + 8*4; + sp_digit* t5 = t + 10*4; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod); @@ -8972,17 +8963,9 @@ static void sp_256_proj_point_add_4(sp_point_256* r, sp_256_proj_point_dbl_4(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_256_mont_sub_4(t2, t2, t1, p256_mod); @@ -9000,20 +8983,31 @@ static void sp_256_proj_point_add_4(sp_point_256* r, sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod); sp_256_mont_sub_dbl_4(x, x, y, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_4(y, y, x, p256_mod); + sp_256_mont_sub_4(y, y, x, p256_mod); sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_4(y, y, t5, p256_mod); - for (i = 0; i < 4; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 4; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -9059,12 +9053,12 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*4; - ctx->t3 = t + 4*4; - ctx->t4 = t + 6*4; - ctx->t5 = t + 8*4; - ctx->t6 = t + 10*4; + ctx->t6 = t; + ctx->t1 = t + 2*4; + ctx->t2 = t + 4*4; + ctx->t3 = t + 6*4; + ctx->t4 = t + 8*4; + ctx->t5 = t + 10*4; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -9170,7 +9164,7 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 22; break; case 22: @@ -9183,22 +9177,28 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, 
sp_point_256* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 4; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 4; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -9256,7 +9256,7 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_4(a, t1, p256_mod); + sp_256_mont_tpl_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_4(b, t1, x, p256_mod, p256_mp_mod); @@ -9353,8 +9353,8 @@ static void sp_256_proj_point_add_sub_4(sp_point_256* ra, sp_256_mont_sub_4(xs, xs, t1, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_256_mont_sub_lower_4(ys, ya, xs, p256_mod); - sp_256_mont_sub_lower_4(ya, ya, xa, p256_mod); + sp_256_mont_sub_4(ys, ya, xs, p256_mod); + sp_256_mont_sub_4(ya, ya, xa, p256_mod); sp_256_mont_mul_4(ya, ya, t4, p256_mod, p256_mp_mod); sp_256_sub_4(t6, p256_mod, t6); sp_256_mont_mul_4(ys, ys, t6, p256_mod, p256_mp_mod); @@ -9488,7 +9488,7 @@ static int sp_256_ecc_mulmod_win_add_sub_4(sp_point_256* r, const sp_point_256* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * + t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -9757,7 +9757,6 @@ static void sp_256_map_avx2_4(sp_point_256* r, const sp_point_256* p, #define sp_256_mont_dbl_avx2_4 sp_256_mont_dbl_4 #define sp_256_mont_tpl_avx2_4 sp_256_mont_tpl_4 #define sp_256_mont_sub_avx2_4 sp_256_mont_sub_4 -#define sp_256_mont_sub_lower_avx2_4 sp_256_mont_sub_lower_4 #ifdef __cplusplus extern "C" { #endif @@ -9765,6 +9764,7 @@ extern void sp_256_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m #ifdef __cplusplus } #endif +#define sp_256_mont_sub_dbl_avx2_4 sp_256_mont_sub_dbl_4 /* Double the Montgomery form projective point p. * * r Result of doubling point. 
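The sp_*_from_mp rewrites above drive both the copy mask and the source index from the sign bit of a signed counter. A hedged C sketch of the same idea for 64-bit digits (illustrative names and signature; not the library's routine):

#include <stdint.h>

typedef uint64_t sp_digit;

/* Copy 'used' digits from dp into r[0..size-1] and zero-pad the rest
 * without a data-dependent branch: while j is still negative (i < used)
 * the mask is all-ones and the source index o advances; once j reaches
 * zero the mask becomes zero and o stays parked on the last digit read. */
static void from_digits(sp_digit* r, int size, const sp_digit* dp, int used)
{
    sp_digit j = (sp_digit)0 - (sp_digit)used;
    int o = 0;
    int i;

    for (i = 0; i < size; i++) {
        sp_digit mask = (sp_digit)0 - (j >> 63);
        r[i] = dp[o] & mask;
        j++;
        o += (int)(j >> 63);
    }
}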
@@ -9814,12 +9814,10 @@ static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p, sp_256_mont_mul_avx2_4(y, y, p->x, p256_mod, p256_mp_mod); /* X = T1 * T1 */ sp_256_mont_sqr_avx2_4(x, t1, p256_mod, p256_mp_mod); - /* X = X - Y */ - sp_256_mont_sub_avx2_4(x, x, y, p256_mod); - /* X = X - Y */ - sp_256_mont_sub_avx2_4(x, x, y, p256_mod); + /* X = X - 2*Y */ + sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod); /* Y = Y - X */ - sp_256_mont_sub_lower_avx2_4(y, y, x, p256_mod); + sp_256_mont_sub_avx2_4(y, y, x, p256_mod); /* Y = Y * T1 */ sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod); /* Y = Y - T2 */ @@ -9930,18 +9928,16 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r ctx->state = 14; break; case 14: - /* X = X - Y */ - sp_256_mont_sub_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod); + /* X = X - 2*Y */ + sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod); ctx->state = 15; break; case 15: - /* X = X - Y */ - sp_256_mont_sub_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod); ctx->state = 16; break; case 16: /* Y = Y - X */ - sp_256_mont_sub_lower_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 17; break; case 17: @@ -9966,8 +9962,6 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_256_mont_tpl_lower_avx2_4 sp_256_mont_tpl_lower_4 -#define sp_256_mont_sub_dbl_avx2_4 sp_256_mont_sub_dbl_4 #define sp_256_mont_dbl_sub_avx2_4 sp_256_mont_dbl_sub_4 /* Double the Montgomery form projective point p a number of times. * @@ -10006,7 +10000,7 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_avx2_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_avx2_4(a, t1, p256_mod); + sp_256_mont_tpl_avx2_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_avx2_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod); @@ -10034,7 +10028,7 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_avx2_4(t1, x, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_avx2_4(a, t1, p256_mod); + sp_256_mont_tpl_avx2_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_avx2_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod); @@ -10066,12 +10060,12 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i, static void sp_256_proj_point_add_avx2_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*4; - sp_digit* t3 = t + 4*4; - sp_digit* t4 = t + 6*4; - sp_digit* t5 = t + 8*4; - sp_digit* t6 = t + 10*4; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*4; + sp_digit* t2 = t + 4*4; + sp_digit* t3 = t + 6*4; + sp_digit* t4 = t + 8*4; + sp_digit* t5 = t + 10*4; /* U1 = X1*Z2^2 */ sp_256_mont_sqr_avx2_4(t1, q->z, p256_mod, p256_mp_mod); @@ -10093,17 +10087,9 @@ static void sp_256_proj_point_add_avx2_4(sp_point_256* r, sp_256_proj_point_dbl_avx2_4(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod); @@ -10121,20 
+10107,31 @@ static void sp_256_proj_point_add_avx2_4(sp_point_256* r, sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod); sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_avx2_4(y, y, x, p256_mod); + sp_256_mont_sub_avx2_4(y, y, x, p256_mod); sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(y, y, t5, p256_mod); - for (i = 0; i < 4; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 4; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -10180,12 +10177,12 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*4; - ctx->t3 = t + 4*4; - ctx->t4 = t + 6*4; - ctx->t5 = t + 8*4; - ctx->t6 = t + 10*4; + ctx->t6 = t; + ctx->t1 = t + 2*4; + ctx->t2 = t + 4*4; + ctx->t3 = t + 6*4; + ctx->t4 = t + 8*4; + ctx->t5 = t + 10*4; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -10291,7 +10288,7 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_256_mont_sub_lower_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod); + sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod); ctx->state = 22; break; case 22: @@ -10304,22 +10301,28 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 4; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 4; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -10377,7 +10380,7 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r, /* A = 3*(X^2 - W) */ sp_256_mont_sqr_avx2_4(t1, x, 
p256_mod, p256_mp_mod); sp_256_mont_sub_avx2_4(t1, t1, w, p256_mod); - sp_256_mont_tpl_lower_avx2_4(a, t1, p256_mod); + sp_256_mont_tpl_avx2_4(a, t1, p256_mod); /* B = X*Y^2 */ sp_256_mont_sqr_avx2_4(t1, y, p256_mod, p256_mp_mod); sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod); @@ -10474,8 +10477,8 @@ static void sp_256_proj_point_add_sub_avx2_4(sp_point_256* ra, sp_256_mont_sub_avx2_4(xs, xs, t1, p256_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_256_mont_sub_lower_avx2_4(ys, ya, xs, p256_mod); - sp_256_mont_sub_lower_avx2_4(ya, ya, xa, p256_mod); + sp_256_mont_sub_avx2_4(ys, ya, xs, p256_mod); + sp_256_mont_sub_avx2_4(ya, ya, xa, p256_mod); sp_256_mont_mul_avx2_4(ya, ya, t4, p256_mod, p256_mp_mod); sp_256_sub_4(t6, p256_mod, t6); sp_256_mont_mul_avx2_4(ys, ys, t6, p256_mod, p256_mp_mod); @@ -10524,7 +10527,7 @@ static int sp_256_ecc_mulmod_win_add_sub_avx2_4(sp_point_256* r, const sp_point_ (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * + t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -10650,12 +10653,11 @@ typedef struct sp_table_entry_256 { static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*4; - sp_digit* t3 = t + 4*4; - sp_digit* t4 = t + 6*4; - sp_digit* t5 = t + 8*4; - sp_digit* t6 = t + 10*4; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*4; + sp_digit* t6 = t + 4*4; + sp_digit* t1 = t + 6*4; + sp_digit* t4 = t + 8*4; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -10671,13 +10673,9 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, sp_256_proj_point_dbl_4(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_256_mont_sub_4(t2, t2, p->x, p256_mod); @@ -10686,32 +10684,39 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, /* Z3 = H*Z1 */ sp_256_mont_mul_4(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_256_mont_sqr_4(t1, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t3, p->x, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_4(x, t1, t5, p256_mod); - sp_256_mont_sub_dbl_4(x, x, t3, p256_mod); + sp_256_mont_sqr_4(t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t3, p->x, t1, p256_mod, p256_mp_mod); + sp_256_mont_mul_4(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_4(t2, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_4(t2, t2, t1, p256_mod); + sp_256_mont_sub_dbl_4(x, t2, t3, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_lower_4(t3, t3, x, p256_mod); + sp_256_mont_sub_4(t3, t3, x, p256_mod); sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_4(t5, t5, p->y, p256_mod, p256_mp_mod); - sp_256_mont_sub_4(y, t3, t5, p256_mod); + sp_256_mont_mul_4(t1, t1, p->y, p256_mod, p256_mp_mod); + sp_256_mont_sub_4(y, t3, t1, p256_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 4; 
i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 4; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -10869,7 +10874,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 4 * 6]; + sp_digit t[2 * 4 * 5]; #endif sp_point_256* p = NULL; int i; @@ -10890,7 +10895,7 @@ static int sp_256_ecc_mulmod_stripe_4(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -11074,13 +11079,13 @@ static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_ #ifdef WOLFSSL_SP_SMALL_STACK sp_digit* tmp; #else - sp_digit tmp[2 * 4 * 6]; + sp_digit tmp[2 * 4 * 5]; #endif sp_cache_256_t* cache; int err = MP_OKAY; #ifdef WOLFSSL_SP_SMALL_STACK - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; } @@ -11137,12 +11142,11 @@ static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r, const sp_point_256* p, const sp_point_256* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*4; - sp_digit* t3 = t + 4*4; - sp_digit* t4 = t + 6*4; - sp_digit* t5 = t + 8*4; - sp_digit* t6 = t + 10*4; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*4; + sp_digit* t6 = t + 4*4; + sp_digit* t1 = t + 6*4; + sp_digit* t4 = t + 8*4; /* Calculate values to subtract from P->x and P->y. 
*/ /* U2 = X2*Z1^2 */ @@ -11158,13 +11162,9 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r, sp_256_proj_point_dbl_avx2_4(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_256_mont_sub_avx2_4(t2, t2, p->x, p256_mod); @@ -11173,32 +11173,39 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r, /* Z3 = H*Z1 */ sp_256_mont_mul_avx2_4(z, p->z, t2, p256_mod, p256_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_256_mont_sqr_avx2_4(t1, t4, p256_mod, p256_mp_mod); - sp_256_mont_sqr_avx2_4(t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(t3, p->x, t5, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(t5, t5, t2, p256_mod, p256_mp_mod); - sp_256_mont_sub_avx2_4(x, t1, t5, p256_mod); - sp_256_mont_sub_dbl_avx2_4(x, x, t3, p256_mod); + sp_256_mont_sqr_avx2_4(t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t3, p->x, t1, p256_mod, p256_mp_mod); + sp_256_mont_mul_avx2_4(t1, t1, t2, p256_mod, p256_mp_mod); + sp_256_mont_sqr_avx2_4(t2, t4, p256_mod, p256_mp_mod); + sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod); + sp_256_mont_sub_dbl_avx2_4(x, t2, t3, p256_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_256_mont_sub_lower_avx2_4(t3, t3, x, p256_mod); + sp_256_mont_sub_avx2_4(t3, t3, x, p256_mod); sp_256_mont_mul_avx2_4(t3, t3, t4, p256_mod, p256_mp_mod); - sp_256_mont_mul_avx2_4(t5, t5, p->y, p256_mod, p256_mp_mod); - sp_256_mont_sub_avx2_4(y, t3, t5, p256_mod); + sp_256_mont_mul_avx2_4(t1, t1, p->y, p256_mod, p256_mp_mod); + sp_256_mont_sub_avx2_4(y, t3, t1, p256_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 4; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 4; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 4; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 4; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 4; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -11342,7 +11349,7 @@ static int sp_256_ecc_mulmod_stripe_avx2_4(sp_point_256* r, const sp_point_256* sp_digit* t = NULL; #else sp_point_256 rt[2]; - sp_digit t[2 * 4 * 6]; + sp_digit t[2 * 4 * 5]; #endif sp_point_256* p = NULL; int i; @@ -11363,7 +11370,7 @@ static int sp_256_ecc_mulmod_stripe_avx2_4(sp_point_256* r, const sp_point_256* if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, + t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -11454,13 +11461,13 @@ static int sp_256_ecc_mulmod_avx2_4(sp_point_256* r, const sp_point_256* g, cons #ifdef WOLFSSL_SP_SMALL_STACK sp_digit* tmp; #else - sp_digit tmp[2 * 4 * 6]; + sp_digit tmp[2 * 4 * 5]; #endif sp_cache_256_t* cache; int err = MP_OKAY; #ifdef 
WOLFSSL_SP_SMALL_STACK - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, DYNAMIC_TYPE_ECC); + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) { err = MEMORY_E; } @@ -11582,7 +11589,7 @@ int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_256* point = NULL; + sp_point_256* point = NULL; sp_digit* k = NULL; #else sp_point_256 point[2]; @@ -24101,7 +24108,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, sp_digit* tmp = NULL; #else sp_point_256 rt[2]; - sp_digit tmp[2 * 4 * 6]; + sp_digit tmp[2 * 4 * 5]; #endif sp_point_256* p = NULL; sp_digit* negy = NULL; @@ -24120,7 +24127,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -24179,7 +24186,7 @@ static int sp_256_ecc_mulmod_add_only_4(sp_point_256* r, const sp_point_256* g, if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 6); + ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 5); #ifdef WOLFSSL_SP_SMALL_STACK XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -24235,7 +24242,7 @@ static int sp_256_ecc_mulmod_add_only_avx2_4(sp_point_256* r, const sp_point_256 sp_digit* tmp = NULL; #else sp_point_256 rt[2]; - sp_digit tmp[2 * 4 * 6]; + sp_digit tmp[2 * 4 * 5]; #endif sp_point_256* p = NULL; sp_digit* negy = NULL; @@ -24254,7 +24261,7 @@ static int sp_256_ecc_mulmod_add_only_avx2_4(sp_point_256* r, const sp_point_256 if (rt == NULL) err = MEMORY_E; if (err == MP_OKAY) { - tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap, + tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap, DYNAMIC_TYPE_ECC); if (tmp == NULL) err = MEMORY_E; @@ -24313,7 +24320,7 @@ static int sp_256_ecc_mulmod_add_only_avx2_4(sp_point_256* r, const sp_point_256 if (tmp != NULL) #endif { - ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 6); + ForceZero(tmp, sizeof(sp_digit) * 2 * 4 * 5); #ifdef WOLFSSL_SP_SMALL_STACK XFREE(tmp, heap, DYNAMIC_TYPE_ECC); #endif @@ -24434,7 +24441,7 @@ int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am, #endif #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -24600,7 +24607,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_256* infinity = NULL; #endif int err = MP_OKAY; - + #ifdef HAVE_INTEL_AVX2 word32 cpuid_flags = cpuid_get_flags(); #endif @@ -24611,7 +24618,7 @@ int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -27289,14 +27296,14 @@ static void sp_384_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - 
(((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -27728,13 +27735,6 @@ extern void sp_384_mont_sub_6(sp_digit* r, const sp_digit* a, const sp_digit* b, #ifdef __cplusplus extern "C" { #endif -extern void sp_384_mont_sub_lower_6(sp_digit* r, const sp_digit* a, const sp_digit* b, const sp_digit* m); -#ifdef __cplusplus -} -#endif -#ifdef __cplusplus -extern "C" { -#endif extern void sp_384_div2_6(sp_digit* r, const sp_digit* a, const sp_digit* m); #ifdef __cplusplus } @@ -27793,7 +27793,7 @@ static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p, /* X = X - Y */ sp_384_mont_sub_6(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_lower_6(y, y, x, p384_mod); + sp_384_mont_sub_6(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_6(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -27915,7 +27915,7 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con break; case 16: /* Y = Y - X */ - sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -27940,20 +27940,6 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#ifdef __cplusplus -extern "C" { -#endif -extern void sp_384_mont_dbl_lower_6(sp_digit* r, const sp_digit* a, const sp_digit* m); -#ifdef __cplusplus -} -#endif -#ifdef __cplusplus -extern "C" { -#endif -extern void sp_384_mont_tpl_lower_6(sp_digit* r, const sp_digit* a, const sp_digit* m); -#ifdef __cplusplus -} -#endif /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
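For reference, the sp_384_from_mp hunk above replaces the per-iteration index arithmetic with a single running counter that keeps its top bit set while source digits remain; the copied values are unchanged, only the way the cut-off at a->used is computed differs. A minimal standalone sketch of the same branch-free copy, assuming 64-bit digits (the names below are illustrative, not taken from the patch):

    #include <stdint.h>

    typedef uint64_t digit;

    /* Copy 'used' digits from dp[] into r[0..size-1] and zero-pad the rest,
     * with no data-dependent branches (dp must hold at least one digit). */
    static void copy_digits_ct(digit* r, int size, const digit* dp, int used)
    {
        digit j = (digit)0 - (digit)used; /* top bit set while i < used    */
        int   o = 0;                      /* source index; stops at used-1 */
        int   i;

        for (i = 0; i < size; i++) {
            digit mask = (digit)0 - (j >> 63); /* all ones while i < used  */
            r[i] = dp[o] & mask;               /* real digit, or zero      */
            j++;
            o += (int)(j >> 63);               /* advance only while digits remain */
        }
    }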
@@ -27992,7 +27978,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_6(a, t1, p384_mod); + sp_384_mont_tpl_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); @@ -28001,8 +27987,8 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i, sp_384_mont_dbl_6(t2, b, p384_mod); sp_384_mont_sub_6(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_6(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_6(b, t2, p384_mod); + sp_384_mont_sub_6(t2, b, x, p384_mod); + sp_384_mont_dbl_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -28022,7 +28008,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_6(a, t1, p384_mod); + sp_384_mont_tpl_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); @@ -28031,8 +28017,8 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i, sp_384_mont_dbl_6(t2, b, p384_mod); sp_384_mont_sub_6(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_6(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_6(b, t2, p384_mod); + sp_384_mont_sub_6(t2, b, x, p384_mod); + sp_384_mont_dbl_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -28080,12 +28066,12 @@ static int sp_384_iszero_6(const sp_digit* a) static void sp_384_proj_point_add_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*6; - sp_digit* t3 = t + 4*6; - sp_digit* t4 = t + 6*6; - sp_digit* t5 = t + 8*6; - sp_digit* t6 = t + 10*6; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*6; + sp_digit* t2 = t + 4*6; + sp_digit* t3 = t + 6*6; + sp_digit* t4 = t + 8*6; + sp_digit* t5 = t + 10*6; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_6(t1, q->z, p384_mod, p384_mp_mod); @@ -28107,17 +28093,9 @@ static void sp_384_proj_point_add_6(sp_point_384* r, sp_384_proj_point_dbl_6(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_384_mont_sub_6(t2, t2, t1, p384_mod); @@ -28136,20 +28114,31 @@ static void sp_384_proj_point_add_6(sp_point_384* r, sp_384_mont_dbl_6(t3, y, p384_mod); sp_384_mont_sub_6(x, x, t3, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_6(y, y, x, p384_mod); + sp_384_mont_sub_6(y, y, x, p384_mod); sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_6(y, y, t5, p384_mod); - for (i = 0; i < 6; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] 
= (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 6; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 6; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -28195,12 +28184,12 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*6; - ctx->t3 = t + 4*6; - ctx->t4 = t + 6*6; - ctx->t5 = t + 8*6; - ctx->t6 = t + 10*6; + ctx->t6 = t; + ctx->t1 = t + 2*6; + ctx->t2 = t + 4*6; + ctx->t3 = t + 6*6; + ctx->t4 = t + 8*6; + ctx->t5 = t + 10*6; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -28307,7 +28296,7 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 22; break; case 22: @@ -28320,22 +28309,28 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 6; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 6; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 6; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -28394,7 +28389,7 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_6(a, t1, p384_mod); + sp_384_mont_tpl_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_6(b, t1, x, p384_mod, p384_mp_mod); @@ -28404,8 +28399,8 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r, sp_384_mont_dbl_6(t2, b, p384_mod); sp_384_mont_sub_6(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_6(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_6(b, t2, p384_mod); + sp_384_mont_sub_6(t2, b, x, p384_mod); + sp_384_mont_dbl_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_6(r[j].z, z, y, p384_mod, p384_mp_mod); z = r[j].z; @@ -28493,8 +28488,8 @@ static void sp_384_proj_point_add_sub_6(sp_point_384* ra, sp_384_mont_sub_6(xs, xs, t1, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_384_mont_sub_lower_6(ys, ya, xs, p384_mod); - sp_384_mont_sub_lower_6(ya, ya, xa, p384_mod); + 
sp_384_mont_sub_6(ys, ya, xs, p384_mod); + sp_384_mont_sub_6(ya, ya, xa, p384_mod); sp_384_mont_mul_6(ya, ya, t4, p384_mod, p384_mp_mod); sp_384_sub_6(t6, p384_mod, t6); sp_384_mont_mul_6(ys, ys, t6, p384_mod, p384_mp_mod); @@ -28628,7 +28623,7 @@ static int sp_384_ecc_mulmod_win_add_sub_6(sp_point_384* r, const sp_point_384* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * + t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -28933,7 +28928,6 @@ static void sp_384_map_avx2_6(sp_point_384* r, const sp_point_384* p, #define sp_384_mont_dbl_avx2_6 sp_384_mont_dbl_6 #define sp_384_mont_tpl_avx2_6 sp_384_mont_tpl_6 #define sp_384_mont_sub_avx2_6 sp_384_mont_sub_6 -#define sp_384_mont_sub_lower_avx2_6 sp_384_mont_sub_lower_6 #ifdef __cplusplus extern "C" { #endif @@ -28995,7 +28989,7 @@ static void sp_384_proj_point_dbl_avx2_6(sp_point_384* r, const sp_point_384* p, /* X = X - Y */ sp_384_mont_sub_avx2_6(x, x, y, p384_mod); /* Y = Y - X */ - sp_384_mont_sub_lower_avx2_6(y, y, x, p384_mod); + sp_384_mont_sub_avx2_6(y, y, x, p384_mod); /* Y = Y * T1 */ sp_384_mont_mul_avx2_6(y, y, t1, p384_mod, p384_mp_mod); /* Y = Y - T2 */ @@ -29117,7 +29111,7 @@ static int sp_384_proj_point_dbl_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r break; case 16: /* Y = Y - X */ - sp_384_mont_sub_lower_avx2_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_avx2_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 17; break; case 17: @@ -29142,8 +29136,6 @@ static int sp_384_proj_point_dbl_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_384_mont_dbl_lower_avx2_6 sp_384_mont_dbl_lower_6 -#define sp_384_mont_tpl_lower_avx2_6 sp_384_mont_tpl_lower_6 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
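Every point-add routine touched by this patch now ends with the same constant-time selection, grouped into its own block together with the new inf flag. A plain-C sketch of that selection, assuming 64-bit digits (function and parameter names are illustrative only):

    #include <stdint.h>

    typedef uint64_t digit;

    /* Pick one output coordinate word-by-word: P when only Q is at infinity,
     * Q when only P is at infinity, otherwise the freshly computed sum. */
    static void select_coord(digit* r, const digit* p, const digit* q,
                             const digit* sum, int words,
                             unsigned p_inf, unsigned q_inf)
    {
        digit maskp = (digit)0 - (digit)(q_inf & (!p_inf));
        digit maskq = (digit)0 - (digit)(p_inf & (!q_inf));
        digit maskt = ~(maskp | maskq);
        int i;

        for (i = 0; i < words; i++) {
            r[i] = (p[i] & maskp) | (q[i] & maskq) | (sum[i] & maskt);
        }
    }

When both inputs are at infinity, none of the masks select P or Q; the routines above then force a well-formed result with r->z[0] |= inf and record it in r->infinity.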
@@ -29182,7 +29174,7 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_avx2_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_avx2_6(a, t1, p384_mod); + sp_384_mont_tpl_avx2_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_avx2_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(b, t1, x, p384_mod, p384_mp_mod); @@ -29191,8 +29183,8 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i, sp_384_mont_dbl_avx2_6(t2, b, p384_mod); sp_384_mont_sub_avx2_6(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_avx2_6(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_avx2_6(b, t2, p384_mod); + sp_384_mont_sub_avx2_6(t2, b, x, p384_mod); + sp_384_mont_dbl_avx2_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_avx2_6(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -29212,7 +29204,7 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_avx2_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_avx2_6(a, t1, p384_mod); + sp_384_mont_tpl_avx2_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_avx2_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(b, t1, x, p384_mod, p384_mp_mod); @@ -29221,8 +29213,8 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i, sp_384_mont_dbl_avx2_6(t2, b, p384_mod); sp_384_mont_sub_avx2_6(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_avx2_6(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_avx2_6(b, t2, p384_mod); + sp_384_mont_sub_avx2_6(t2, b, x, p384_mod); + sp_384_mont_dbl_avx2_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_avx2_6(z, z, y, p384_mod, p384_mp_mod); /* t1 = Y^4 */ @@ -29246,12 +29238,12 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i, static void sp_384_proj_point_add_avx2_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*6; - sp_digit* t3 = t + 4*6; - sp_digit* t4 = t + 6*6; - sp_digit* t5 = t + 8*6; - sp_digit* t6 = t + 10*6; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*6; + sp_digit* t2 = t + 4*6; + sp_digit* t3 = t + 6*6; + sp_digit* t4 = t + 8*6; + sp_digit* t5 = t + 10*6; /* U1 = X1*Z2^2 */ sp_384_mont_sqr_avx2_6(t1, q->z, p384_mod, p384_mp_mod); @@ -29273,17 +29265,9 @@ static void sp_384_proj_point_add_avx2_6(sp_point_384* r, sp_384_proj_point_dbl_avx2_6(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_384_mont_sub_avx2_6(t2, t2, t1, p384_mod); @@ -29302,20 +29286,31 @@ static void sp_384_proj_point_add_avx2_6(sp_point_384* r, sp_384_mont_dbl_avx2_6(t3, y, p384_mod); sp_384_mont_sub_avx2_6(x, x, t3, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_avx2_6(y, y, x, p384_mod); + sp_384_mont_sub_avx2_6(y, y, x, p384_mod); sp_384_mont_mul_avx2_6(y, y, t4, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(y, y, t5, p384_mod); - for (i = 0; i < 6; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 
0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 6; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 6; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -29361,12 +29356,12 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*6; - ctx->t3 = t + 4*6; - ctx->t4 = t + 6*6; - ctx->t5 = t + 8*6; - ctx->t6 = t + 10*6; + ctx->t6 = t; + ctx->t1 = t + 2*6; + ctx->t2 = t + 4*6; + ctx->t3 = t + 6*6; + ctx->t4 = t + 8*6; + ctx->t5 = t + 10*6; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -29473,7 +29468,7 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_384_mont_sub_lower_avx2_6(ctx->y, ctx->y, ctx->x, p384_mod); + sp_384_mont_sub_avx2_6(ctx->y, ctx->y, ctx->x, p384_mod); ctx->state = 22; break; case 22: @@ -29486,22 +29481,28 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 6; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 6; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 6; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -29560,7 +29561,7 @@ static void sp_384_proj_point_dbl_n_store_avx2_6(sp_point_384* r, /* A = 3*(X^2 - W) */ sp_384_mont_sqr_avx2_6(t1, x, p384_mod, p384_mp_mod); sp_384_mont_sub_avx2_6(t1, t1, w, p384_mod); - sp_384_mont_tpl_lower_avx2_6(a, t1, p384_mod); + sp_384_mont_tpl_avx2_6(a, t1, p384_mod); /* B = X*Y^2 */ sp_384_mont_sqr_avx2_6(t1, y, p384_mod, p384_mp_mod); sp_384_mont_mul_avx2_6(b, t1, x, p384_mod, p384_mp_mod); @@ -29570,8 +29571,8 @@ static void sp_384_proj_point_dbl_n_store_avx2_6(sp_point_384* r, sp_384_mont_dbl_avx2_6(t2, b, p384_mod); sp_384_mont_sub_avx2_6(x, x, t2, p384_mod); /* B = 2.(B - X) */ - sp_384_mont_sub_lower_avx2_6(t2, b, x, p384_mod); - sp_384_mont_dbl_lower_avx2_6(b, t2, p384_mod); + sp_384_mont_sub_avx2_6(t2, b, x, p384_mod); + sp_384_mont_dbl_avx2_6(b, t2, p384_mod); /* Z = Z*Y */ sp_384_mont_mul_avx2_6(r[j].z, z, y, p384_mod, 
p384_mp_mod); z = r[j].z; @@ -29659,8 +29660,8 @@ static void sp_384_proj_point_add_sub_avx2_6(sp_point_384* ra, sp_384_mont_sub_avx2_6(xs, xs, t1, p384_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_384_mont_sub_lower_avx2_6(ys, ya, xs, p384_mod); - sp_384_mont_sub_lower_avx2_6(ya, ya, xa, p384_mod); + sp_384_mont_sub_avx2_6(ys, ya, xs, p384_mod); + sp_384_mont_sub_avx2_6(ya, ya, xa, p384_mod); sp_384_mont_mul_avx2_6(ya, ya, t4, p384_mod, p384_mp_mod); sp_384_sub_6(t6, p384_mod, t6); sp_384_mont_mul_avx2_6(ys, ys, t6, p384_mod, p384_mp_mod); @@ -29709,7 +29710,7 @@ static int sp_384_ecc_mulmod_win_add_sub_avx2_6(sp_point_384* r, const sp_point_ (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * + t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -29835,12 +29836,12 @@ typedef struct sp_table_entry_384 { static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*6; - sp_digit* t3 = t + 4*6; - sp_digit* t4 = t + 6*6; - sp_digit* t5 = t + 8*6; - sp_digit* t6 = t + 10*6; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*6; + sp_digit* t6 = t + 4*6; + sp_digit* t1 = t + 6*6; + sp_digit* t4 = t + 8*6; + sp_digit* t5 = t + 10*6; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -29856,13 +29857,9 @@ static void sp_384_proj_point_add_qz1_6(sp_point_384* r, sp_384_proj_point_dbl_6(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_384_mont_sub_6(t2, t2, p->x, p384_mod); @@ -29871,33 +29868,40 @@ static void sp_384_proj_point_add_qz1_6(sp_point_384* r, /* Z3 = H*Z1 */ sp_384_mont_mul_6(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_384_mont_sqr_6(t1, t4, p384_mod, p384_mp_mod); - sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t3, p->x, t5, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_6(x, t1, t5, p384_mod); - sp_384_mont_dbl_6(t1, t3, p384_mod); - sp_384_mont_sub_6(x, x, t1, p384_mod); + sp_384_mont_sqr_6(t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t3, p->x, t1, p384_mod, p384_mp_mod); + sp_384_mont_mul_6(t1, t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_6(t2, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_6(t2, t2, t1, p384_mod); + sp_384_mont_dbl_6(t5, t3, p384_mod); + sp_384_mont_sub_6(x, t2, t5, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_lower_6(t3, t3, x, p384_mod); + sp_384_mont_sub_6(t3, t3, x, p384_mod); sp_384_mont_mul_6(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_6(t5, t5, p->y, p384_mod, p384_mp_mod); - sp_384_mont_sub_6(y, t3, t5, p384_mod); + sp_384_mont_mul_6(t1, t1, p->y, p384_mod, p384_mp_mod); + sp_384_mont_sub_6(y, t3, t1, p384_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 6; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & 
maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 6; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 6; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -30323,12 +30327,12 @@ static int sp_384_ecc_mulmod_6(sp_point_384* r, const sp_point_384* g, const sp_ static void sp_384_proj_point_add_qz1_avx2_6(sp_point_384* r, const sp_point_384* p, const sp_point_384* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*6; - sp_digit* t3 = t + 4*6; - sp_digit* t4 = t + 6*6; - sp_digit* t5 = t + 8*6; - sp_digit* t6 = t + 10*6; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*6; + sp_digit* t6 = t + 4*6; + sp_digit* t1 = t + 6*6; + sp_digit* t4 = t + 8*6; + sp_digit* t5 = t + 10*6; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -30344,13 +30348,9 @@ static void sp_384_proj_point_add_qz1_avx2_6(sp_point_384* r, sp_384_proj_point_dbl_avx2_6(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_384_mont_sub_avx2_6(t2, t2, p->x, p384_mod); @@ -30359,33 +30359,40 @@ static void sp_384_proj_point_add_qz1_avx2_6(sp_point_384* r, /* Z3 = H*Z1 */ sp_384_mont_mul_avx2_6(z, p->z, t2, p384_mod, p384_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_384_mont_sqr_avx2_6(t1, t4, p384_mod, p384_mp_mod); - sp_384_mont_sqr_avx2_6(t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(t3, p->x, t5, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(t5, t5, t2, p384_mod, p384_mp_mod); - sp_384_mont_sub_avx2_6(x, t1, t5, p384_mod); - sp_384_mont_dbl_avx2_6(t1, t3, p384_mod); - sp_384_mont_sub_avx2_6(x, x, t1, p384_mod); + sp_384_mont_sqr_avx2_6(t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t3, p->x, t1, p384_mod, p384_mp_mod); + sp_384_mont_mul_avx2_6(t1, t1, t2, p384_mod, p384_mp_mod); + sp_384_mont_sqr_avx2_6(t2, t4, p384_mod, p384_mp_mod); + sp_384_mont_sub_avx2_6(t2, t2, t1, p384_mod); + sp_384_mont_dbl_avx2_6(t5, t3, p384_mod); + sp_384_mont_sub_avx2_6(x, t2, t5, p384_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_384_mont_sub_lower_avx2_6(t3, t3, x, p384_mod); + sp_384_mont_sub_avx2_6(t3, t3, x, p384_mod); sp_384_mont_mul_avx2_6(t3, t3, t4, p384_mod, p384_mp_mod); - sp_384_mont_mul_avx2_6(t5, t5, p->y, p384_mod, p384_mp_mod); - sp_384_mont_sub_avx2_6(y, t3, t5, p384_mod); + sp_384_mont_mul_avx2_6(t1, t1, p->y, p384_mod, p384_mp_mod); + sp_384_mont_sub_avx2_6(y, t3, t1, p384_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 6; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 6; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 6; i++) { + r->z[i] = (p->z[i] & maskp) 
| (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 6; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 6; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -30769,7 +30776,7 @@ int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_384* point = NULL; + sp_point_384* point = NULL; sp_digit* k = NULL; #else sp_point_384 point[2]; @@ -49435,7 +49442,7 @@ int sp_ecc_mulmod_base_add_384(const mp_int* km, const ecc_point* am, #endif #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -49601,7 +49608,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_384* infinity = NULL; #endif int err = MP_OKAY; - + #ifdef HAVE_INTEL_AVX2 word32 cpuid_flags = cpuid_get_flags(); #endif @@ -49612,7 +49619,7 @@ int sp_ecc_make_key_384(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -52203,14 +52210,14 @@ static void sp_521_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -52620,7 +52627,6 @@ extern void sp_521_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b, #ifdef __cplusplus } #endif -#define sp_521_mont_sub_lower_9 sp_521_mont_sub_9 #ifdef __cplusplus extern "C" { #endif @@ -52682,7 +52688,7 @@ static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p, /* X = X - Y */ sp_521_mont_sub_9(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_lower_9(y, y, x, p521_mod); + sp_521_mont_sub_9(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -52804,7 +52810,7 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con break; case 16: /* Y = Y - X */ - sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -52829,8 +52835,6 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_521_mont_dbl_lower_9 sp_521_mont_dbl_9 -#define sp_521_mont_tpl_lower_9 sp_521_mont_tpl_9 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
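The *_mont_sub_lower_* aliases removed throughout this patch all collapse onto the ordinary Montgomery-form subtraction, whose x86_64 implementation is in assembly and not shown in this diff. As a reference only, one common way to write such a constant-time r = (a - b) mod m in plain C, assuming 64-bit digits, inputs already reduced below m, and a compiler providing unsigned __int128:

    #include <stdint.h>

    typedef uint64_t digit;

    /* r = (a - b) mod m: subtract with borrow, then add m back exactly when
     * a < b, selected by a mask rather than a branch. */
    static void mod_sub_ct(digit* r, const digit* a, const digit* b,
                           const digit* m, int words)
    {
        unsigned __int128 acc;
        digit borrow = 0;
        digit mask;
        int i;

        for (i = 0; i < words; i++) {
            acc = (unsigned __int128)a[i] - b[i] - borrow;
            r[i] = (digit)acc;
            borrow = (digit)(acc >> 64) & 1; /* 1 iff this word borrowed */
        }

        mask = (digit)0 - borrow;            /* all ones iff a < b */
        acc = 0;
        for (i = 0; i < words; i++) {
            acc += (unsigned __int128)r[i] + (m[i] & mask);
            r[i] = (digit)acc;
            acc >>= 64;
        }
    }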
@@ -52869,7 +52873,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_9(a, t1, p521_mod); + sp_521_mont_tpl_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -52878,8 +52882,8 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_9(b, t2, p521_mod); + sp_521_mont_sub_9(t2, b, x, p521_mod); + sp_521_mont_dbl_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -52899,7 +52903,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_9(a, t1, p521_mod); + sp_521_mont_tpl_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -52908,8 +52912,8 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i, sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_9(b, t2, p521_mod); + sp_521_mont_sub_9(t2, b, x, p521_mod); + sp_521_mont_dbl_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -52959,12 +52963,12 @@ static int sp_521_iszero_9(const sp_digit* a) static void sp_521_proj_point_add_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*9; - sp_digit* t3 = t + 4*9; - sp_digit* t4 = t + 6*9; - sp_digit* t5 = t + 8*9; - sp_digit* t6 = t + 10*9; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*9; + sp_digit* t2 = t + 4*9; + sp_digit* t3 = t + 6*9; + sp_digit* t4 = t + 8*9; + sp_digit* t5 = t + 10*9; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod); @@ -52986,17 +52990,9 @@ static void sp_521_proj_point_add_9(sp_point_521* r, sp_521_proj_point_dbl_9(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_521_mont_sub_9(t2, t2, t1, p521_mod); @@ -53015,20 +53011,31 @@ static void sp_521_proj_point_add_9(sp_point_521* r, sp_521_mont_dbl_9(t3, y, p521_mod); sp_521_mont_sub_9(x, x, t3, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_9(y, y, x, p521_mod); + sp_521_mont_sub_9(y, y, x, p521_mod); sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_9(y, y, t5, p521_mod); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] 
= (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -53074,12 +53081,12 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*9; - ctx->t3 = t + 4*9; - ctx->t4 = t + 6*9; - ctx->t5 = t + 8*9; - ctx->t6 = t + 10*9; + ctx->t6 = t; + ctx->t1 = t + 2*9; + ctx->t2 = t + 4*9; + ctx->t3 = t + 6*9; + ctx->t4 = t + 8*9; + ctx->t5 = t + 10*9; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -53186,7 +53193,7 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 22; break; case 22: @@ -53199,22 +53206,28 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -53273,7 +53286,7 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_9(a, t1, p521_mod); + sp_521_mont_tpl_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_9(b, t1, x, p521_mod, p521_mp_mod); @@ -53283,8 +53296,8 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r, sp_521_mont_dbl_9(t2, b, p521_mod); sp_521_mont_sub_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_9(b, t2, p521_mod); + sp_521_mont_sub_9(t2, b, x, p521_mod); + sp_521_mont_dbl_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_9(r[j].z, z, y, p521_mod, p521_mp_mod); z = r[j].z; @@ -53372,8 +53385,8 @@ static void sp_521_proj_point_add_sub_9(sp_point_521* ra, sp_521_mont_sub_9(xs, xs, t1, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_521_mont_sub_lower_9(ys, ya, xs, p521_mod); - sp_521_mont_sub_lower_9(ya, ya, xa, p521_mod); + 
sp_521_mont_sub_9(ys, ya, xs, p521_mod); + sp_521_mont_sub_9(ya, ya, xa, p521_mod); sp_521_mont_mul_9(ya, ya, t4, p521_mod, p521_mp_mod); sp_521_sub_9(t6, p521_mod, t6); sp_521_mont_mul_9(ys, ys, t6, p521_mod, p521_mp_mod); @@ -53507,7 +53520,7 @@ static int sp_521_ecc_mulmod_win_add_sub_9(sp_point_521* r, const sp_point_521* (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * + t = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -53789,7 +53802,6 @@ static void sp_521_map_avx2_9(sp_point_521* r, const sp_point_521* p, #define sp_521_mont_dbl_avx2_9 sp_521_mont_dbl_9 #define sp_521_mont_tpl_avx2_9 sp_521_mont_tpl_9 #define sp_521_mont_sub_avx2_9 sp_521_mont_sub_9 -#define sp_521_mont_sub_lower_avx2_9 sp_521_mont_sub_avx2_9 #ifdef __cplusplus extern "C" { #endif @@ -53851,7 +53863,7 @@ static void sp_521_proj_point_dbl_avx2_9(sp_point_521* r, const sp_point_521* p, /* X = X - Y */ sp_521_mont_sub_avx2_9(x, x, y, p521_mod); /* Y = Y - X */ - sp_521_mont_sub_lower_avx2_9(y, y, x, p521_mod); + sp_521_mont_sub_avx2_9(y, y, x, p521_mod); /* Y = Y * T1 */ sp_521_mont_mul_avx2_9(y, y, t1, p521_mod, p521_mp_mod); /* Y = Y - T2 */ @@ -53973,7 +53985,7 @@ static int sp_521_proj_point_dbl_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r break; case 16: /* Y = Y - X */ - sp_521_mont_sub_lower_avx2_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_avx2_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 17; break; case 17: @@ -53998,8 +54010,6 @@ static int sp_521_proj_point_dbl_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_521_mont_dbl_lower_avx2_9 sp_521_mont_dbl_avx2_9 -#define sp_521_mont_tpl_lower_avx2_9 sp_521_mont_tpl_avx2_9 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
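A quick self-contained check of the four infinity combinations handled by the selection sketched earlier, using single-word "coordinates" for brevity (purely illustrative, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t digit;

    static digit select1(digit p, digit q, digit sum,
                         unsigned p_inf, unsigned q_inf)
    {
        digit maskp = (digit)0 - (digit)(q_inf & (!p_inf));
        digit maskq = (digit)0 - (digit)(p_inf & (!q_inf));
        digit maskt = ~(maskp | maskq);
        return (p & maskp) | (q & maskq) | (sum & maskt);
    }

    int main(void)
    {
        assert(select1(1, 2, 3, 0, 0) == 3); /* neither infinite: computed sum */
        assert(select1(1, 2, 3, 0, 1) == 1); /* Q infinite: result is P        */
        assert(select1(1, 2, 3, 1, 0) == 2); /* P infinite: result is Q        */
        assert(select1(1, 2, 3, 1, 1) == 3); /* both infinite: caller then     */
                                             /* sets r->infinity and z[0] |= 1 */
        return 0;
    }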
@@ -54038,7 +54048,7 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_avx2_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_avx2_9(a, t1, p521_mod); + sp_521_mont_tpl_avx2_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_avx2_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(b, t1, x, p521_mod, p521_mp_mod); @@ -54047,8 +54057,8 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i, sp_521_mont_dbl_avx2_9(t2, b, p521_mod); sp_521_mont_sub_avx2_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_avx2_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_avx2_9(b, t2, p521_mod); + sp_521_mont_sub_avx2_9(t2, b, x, p521_mod); + sp_521_mont_dbl_avx2_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_avx2_9(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -54068,7 +54078,7 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_avx2_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_avx2_9(a, t1, p521_mod); + sp_521_mont_tpl_avx2_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_avx2_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(b, t1, x, p521_mod, p521_mp_mod); @@ -54077,8 +54087,8 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i, sp_521_mont_dbl_avx2_9(t2, b, p521_mod); sp_521_mont_sub_avx2_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_avx2_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_avx2_9(b, t2, p521_mod); + sp_521_mont_sub_avx2_9(t2, b, x, p521_mod); + sp_521_mont_dbl_avx2_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_avx2_9(z, z, y, p521_mod, p521_mp_mod); /* t1 = Y^4 */ @@ -54102,12 +54112,12 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i, static void sp_521_proj_point_add_avx2_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*9; - sp_digit* t3 = t + 4*9; - sp_digit* t4 = t + 6*9; - sp_digit* t5 = t + 8*9; - sp_digit* t6 = t + 10*9; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*9; + sp_digit* t2 = t + 4*9; + sp_digit* t3 = t + 6*9; + sp_digit* t4 = t + 8*9; + sp_digit* t5 = t + 10*9; /* U1 = X1*Z2^2 */ sp_521_mont_sqr_avx2_9(t1, q->z, p521_mod, p521_mp_mod); @@ -54129,17 +54139,9 @@ static void sp_521_proj_point_add_avx2_9(sp_point_521* r, sp_521_proj_point_dbl_avx2_9(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_521_mont_sub_avx2_9(t2, t2, t1, p521_mod); @@ -54158,20 +54160,31 @@ static void sp_521_proj_point_add_avx2_9(sp_point_521* r, sp_521_mont_dbl_avx2_9(t3, y, p521_mod); sp_521_mont_sub_avx2_9(x, x, t3, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_avx2_9(y, y, x, p521_mod); + sp_521_mont_sub_avx2_9(y, y, x, p521_mod); sp_521_mont_mul_avx2_9(y, y, t4, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(y, y, t5, p521_mod); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 
0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -54217,12 +54230,12 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*9; - ctx->t3 = t + 4*9; - ctx->t4 = t + 6*9; - ctx->t5 = t + 8*9; - ctx->t6 = t + 10*9; + ctx->t6 = t; + ctx->t1 = t + 2*9; + ctx->t2 = t + 4*9; + ctx->t3 = t + 6*9; + ctx->t4 = t + 8*9; + ctx->t5 = t + 10*9; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -54329,7 +54342,7 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_521_mont_sub_lower_avx2_9(ctx->y, ctx->y, ctx->x, p521_mod); + sp_521_mont_sub_avx2_9(ctx->y, ctx->y, ctx->x, p521_mod); ctx->state = 22; break; case 22: @@ -54342,22 +54355,28 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -54416,7 +54435,7 @@ static void sp_521_proj_point_dbl_n_store_avx2_9(sp_point_521* r, /* A = 3*(X^2 - W) */ sp_521_mont_sqr_avx2_9(t1, x, p521_mod, p521_mp_mod); sp_521_mont_sub_avx2_9(t1, t1, w, p521_mod); - sp_521_mont_tpl_lower_avx2_9(a, t1, p521_mod); + sp_521_mont_tpl_avx2_9(a, t1, p521_mod); /* B = X*Y^2 */ sp_521_mont_sqr_avx2_9(t1, y, p521_mod, p521_mp_mod); sp_521_mont_mul_avx2_9(b, t1, x, p521_mod, p521_mp_mod); @@ -54426,8 +54445,8 @@ static void sp_521_proj_point_dbl_n_store_avx2_9(sp_point_521* r, sp_521_mont_dbl_avx2_9(t2, b, p521_mod); sp_521_mont_sub_avx2_9(x, x, t2, p521_mod); /* B = 2.(B - X) */ - sp_521_mont_sub_lower_avx2_9(t2, b, x, p521_mod); - sp_521_mont_dbl_lower_avx2_9(b, t2, p521_mod); + sp_521_mont_sub_avx2_9(t2, b, x, p521_mod); + sp_521_mont_dbl_avx2_9(b, t2, p521_mod); /* Z = Z*Y */ sp_521_mont_mul_avx2_9(r[j].z, z, y, p521_mod, 
p521_mp_mod); z = r[j].z; @@ -54515,8 +54534,8 @@ static void sp_521_proj_point_add_sub_avx2_9(sp_point_521* ra, sp_521_mont_sub_avx2_9(xs, xs, t1, p521_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_521_mont_sub_lower_avx2_9(ys, ya, xs, p521_mod); - sp_521_mont_sub_lower_avx2_9(ya, ya, xa, p521_mod); + sp_521_mont_sub_avx2_9(ys, ya, xs, p521_mod); + sp_521_mont_sub_avx2_9(ya, ya, xa, p521_mod); sp_521_mont_mul_avx2_9(ya, ya, t4, p521_mod, p521_mp_mod); sp_521_sub_9(t6, p521_mod, t6); sp_521_mont_mul_avx2_9(ys, ys, t6, p521_mod, p521_mp_mod); @@ -54565,7 +54584,7 @@ static int sp_521_ecc_mulmod_win_add_sub_avx2_9(sp_point_521* r, const sp_point_ (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * + t = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * (33+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -54691,12 +54710,12 @@ typedef struct sp_table_entry_521 { static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*9; - sp_digit* t3 = t + 4*9; - sp_digit* t4 = t + 6*9; - sp_digit* t5 = t + 8*9; - sp_digit* t6 = t + 10*9; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*9; + sp_digit* t6 = t + 4*9; + sp_digit* t1 = t + 6*9; + sp_digit* t4 = t + 8*9; + sp_digit* t5 = t + 10*9; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -54712,13 +54731,9 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, sp_521_proj_point_dbl_9(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_521_mont_sub_9(t2, t2, p->x, p521_mod); @@ -54727,33 +54742,40 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, /* Z3 = H*Z1 */ sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_521_mont_sqr_9(t1, t4, p521_mod, p521_mp_mod); - sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t3, p->x, t5, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(x, t1, t5, p521_mod); - sp_521_mont_dbl_9(t1, t3, p521_mod); - sp_521_mont_sub_9(x, x, t1, p521_mod); + sp_521_mont_sqr_9(t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t3, p->x, t1, p521_mod, p521_mp_mod); + sp_521_mont_mul_9(t1, t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_9(t2, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_9(t2, t2, t1, p521_mod); + sp_521_mont_dbl_9(t5, t3, p521_mod); + sp_521_mont_sub_9(x, t2, t5, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_lower_9(t3, t3, x, p521_mod); + sp_521_mont_sub_9(t3, t3, x, p521_mod); sp_521_mont_mul_9(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_9(t5, t5, p->y, p521_mod, p521_mp_mod); - sp_521_mont_sub_9(y, t3, t5, p521_mod); + sp_521_mont_mul_9(t1, t1, p->y, p521_mod, p521_mp_mod); + sp_521_mont_sub_9(y, t3, t1, p521_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & 
maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -55179,12 +55201,12 @@ static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g, const sp_ static void sp_521_proj_point_add_qz1_avx2_9(sp_point_521* r, const sp_point_521* p, const sp_point_521* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*9; - sp_digit* t3 = t + 4*9; - sp_digit* t4 = t + 6*9; - sp_digit* t5 = t + 8*9; - sp_digit* t6 = t + 10*9; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*9; + sp_digit* t6 = t + 4*9; + sp_digit* t1 = t + 6*9; + sp_digit* t4 = t + 8*9; + sp_digit* t5 = t + 10*9; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -55200,13 +55222,9 @@ static void sp_521_proj_point_add_qz1_avx2_9(sp_point_521* r, sp_521_proj_point_dbl_avx2_9(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_521_mont_sub_avx2_9(t2, t2, p->x, p521_mod); @@ -55215,33 +55233,40 @@ static void sp_521_proj_point_add_qz1_avx2_9(sp_point_521* r, /* Z3 = H*Z1 */ sp_521_mont_mul_avx2_9(z, p->z, t2, p521_mod, p521_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_521_mont_sqr_avx2_9(t1, t4, p521_mod, p521_mp_mod); - sp_521_mont_sqr_avx2_9(t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(t3, p->x, t5, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(t5, t5, t2, p521_mod, p521_mp_mod); - sp_521_mont_sub_avx2_9(x, t1, t5, p521_mod); - sp_521_mont_dbl_avx2_9(t1, t3, p521_mod); - sp_521_mont_sub_avx2_9(x, x, t1, p521_mod); + sp_521_mont_sqr_avx2_9(t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t3, p->x, t1, p521_mod, p521_mp_mod); + sp_521_mont_mul_avx2_9(t1, t1, t2, p521_mod, p521_mp_mod); + sp_521_mont_sqr_avx2_9(t2, t4, p521_mod, p521_mp_mod); + sp_521_mont_sub_avx2_9(t2, t2, t1, p521_mod); + sp_521_mont_dbl_avx2_9(t5, t3, p521_mod); + sp_521_mont_sub_avx2_9(x, t2, t5, p521_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_521_mont_sub_lower_avx2_9(t3, t3, x, p521_mod); + sp_521_mont_sub_avx2_9(t3, t3, x, p521_mod); sp_521_mont_mul_avx2_9(t3, t3, t4, p521_mod, p521_mp_mod); - sp_521_mont_mul_avx2_9(t5, t5, p->y, p521_mod, p521_mp_mod); - sp_521_mont_sub_avx2_9(y, t3, t5, p521_mod); + sp_521_mont_mul_avx2_9(t1, t1, p->y, p521_mod, p521_mp_mod); + sp_521_mont_sub_avx2_9(y, t3, t1, p521_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 9; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 9; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 9; i++) { + r->z[i] = (p->z[i] & maskp) 
| (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 9; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 9; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -55625,7 +55650,7 @@ int sp_ecc_mulmod_add_521(const mp_int* km, const ecc_point* gm, const ecc_point* am, int inMont, ecc_point* r, int map, void* heap) { #ifdef WOLFSSL_SP_SMALL_STACK - sp_point_521* point = NULL; + sp_point_521* point = NULL; sp_digit* k = NULL; #else sp_point_521 point[2]; @@ -90477,7 +90502,7 @@ int sp_ecc_mulmod_base_add_521(const mp_int* km, const ecc_point* am, #endif #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; @@ -90644,7 +90669,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) sp_point_521* infinity = NULL; #endif int err = MP_OKAY; - + #ifdef HAVE_INTEL_AVX2 word32 cpuid_flags = cpuid_get_flags(); #endif @@ -90655,7 +90680,7 @@ int sp_ecc_make_key_521(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap) #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN point = (sp_point_521*)XMALLOC(sizeof(sp_point_521) * 2, heap, DYNAMIC_TYPE_ECC); #else - point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); + point = (sp_point_521*)XMALLOC(sizeof(sp_point_521), heap, DYNAMIC_TYPE_ECC); #endif if (point == NULL) err = MEMORY_E; @@ -93488,14 +93513,14 @@ static void sp_1024_from_mp(sp_digit* r, int size, const mp_int* a) { #if DIGIT_BIT == 64 int i; - int j = 0; + sp_digit j = (sp_digit)0 - (sp_digit)a->used; + int o = 0; for (i = 0; i < size; i++) { - sp_digit mask = - (((sp_digit)((int)a->used - i - 1)) >> (SP_WORD_SIZE - 1)) - 1; - r[i] = a->dp[j] & mask; - j += (int)(((sp_digit)1) - - (((sp_digit)((int)a->used - i - 2)) >> (SP_WORD_SIZE - 1))); + sp_digit mask = (sp_digit)0 - (j >> 63); + r[i] = a->dp[o] & mask; + j++; + o += (int)(j >> 63); } #elif DIGIT_BIT > 64 unsigned int i; @@ -93852,7 +93877,6 @@ extern void sp_1024_mont_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* #ifdef __cplusplus } #endif -#define sp_1024_mont_sub_lower_16 sp_1024_mont_sub_16 #ifdef __cplusplus extern "C" { #endif @@ -93914,7 +93938,7 @@ static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p, /* X = X - Y */ sp_1024_mont_sub_16(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_lower_16(y, y, x, p1024_mod); + sp_1024_mont_sub_16(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_16(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -94036,7 +94060,7 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -94061,8 +94085,6 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_1024_mont_dbl_lower_16 sp_1024_mont_dbl_16 -#define sp_1024_mont_tpl_lower_16 sp_1024_mont_tpl_16 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
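For reference, the constant-time selection that the rewritten blocks above (and the later 16-digit variants below) all converge on can be sketched in isolation as follows. This is not code from the patch: the helper name and the sp_digit_t stand-in type are illustrative only. In the patch the loop runs over the 9 (P-521) or 16 (1024-bit) digits of each coordinate, and the hoisted inf flag is additionally OR-ed into r->z[0] and stored in r->infinity so the both-points-at-infinity case is also handled without a branch.

typedef unsigned long long sp_digit_t;              /* stand-in for sp_digit */

/* Choose r = p, q, or the freshly computed t without a data-dependent branch.
 * Each mask is all-ones or all-zero depending on the infinity flags. */
static void select_point_digits(sp_digit_t* r, const sp_digit_t* p,
                                const sp_digit_t* q, const sp_digit_t* t,
                                int p_inf, int q_inf, int n)
{
    sp_digit_t maskp = (sp_digit_t)0 - (sp_digit_t)(q_inf & (!p_inf)); /* only q infinite: keep p */
    sp_digit_t maskq = (sp_digit_t)0 - (sp_digit_t)(p_inf & (!q_inf)); /* only p infinite: keep q */
    sp_digit_t maskt = ~(maskp | maskq);                               /* neither: keep computed t */
    int i;

    for (i = 0; i < n; i++) {
        r[i] = (p[i] & maskp) | (q[i] & maskq) | (t[i] & maskt);
    }
}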
@@ -94101,7 +94123,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); + sp_1024_mont_tpl_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -94110,8 +94132,8 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i, sp_1024_mont_dbl_16(t2, b, p1024_mod); sp_1024_mont_sub_16(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); + sp_1024_mont_sub_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -94131,7 +94153,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); + sp_1024_mont_tpl_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -94140,8 +94162,8 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i, sp_1024_mont_dbl_16(t2, b, p1024_mod); sp_1024_mont_sub_16(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); + sp_1024_mont_sub_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -94201,12 +94223,12 @@ static int sp_1024_iszero_16(const sp_digit* a) static void sp_1024_proj_point_add_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*16; - sp_digit* t3 = t + 4*16; - sp_digit* t4 = t + 6*16; - sp_digit* t5 = t + 8*16; - sp_digit* t6 = t + 10*16; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*16; + sp_digit* t2 = t + 4*16; + sp_digit* t3 = t + 6*16; + sp_digit* t4 = t + 8*16; + sp_digit* t5 = t + 10*16; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_16(t1, q->z, p1024_mod, p1024_mp_mod); @@ -94228,17 +94250,9 @@ static void sp_1024_proj_point_add_16(sp_point_1024* r, sp_1024_proj_point_dbl_16(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_1024_mont_sub_16(t2, t2, t1, p1024_mod); @@ -94257,20 +94271,31 @@ static void sp_1024_proj_point_add_16(sp_point_1024* r, sp_1024_mont_dbl_16(t3, y, p1024_mod); sp_1024_mont_sub_16(x, x, t3, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_16(y, y, x, p1024_mod); + sp_1024_mont_sub_16(y, y, x, p1024_mod); sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(y, y, t5, p1024_mod); - for (i = 0; i < 16; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & 
maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 16; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 16; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -94316,12 +94341,12 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*16; - ctx->t3 = t + 4*16; - ctx->t4 = t + 6*16; - ctx->t5 = t + 8*16; - ctx->t6 = t + 10*16; + ctx->t6 = t; + ctx->t1 = t + 2*16; + ctx->t2 = t + 4*16; + ctx->t3 = t + 6*16; + ctx->t4 = t + 8*16; + ctx->t5 = t + 10*16; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -94428,7 +94453,7 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 22; break; case 22: @@ -94441,22 +94466,28 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 16; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 16; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 16; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -94515,7 +94546,7 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_16(a, t1, p1024_mod); + sp_1024_mont_tpl_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -94525,8 +94556,8 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r, sp_1024_mont_dbl_16(t2, b, p1024_mod); sp_1024_mont_sub_16(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_16(b, t2, p1024_mod); + sp_1024_mont_sub_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_16(r[j].z, z, y, p1024_mod, p1024_mp_mod); z = r[j].z; @@ -94614,8 +94645,8 @@ static void 
sp_1024_proj_point_add_sub_16(sp_point_1024* ra, sp_1024_mont_sub_16(xs, xs, t1, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_1024_mont_sub_lower_16(ys, ya, xs, p1024_mod); - sp_1024_mont_sub_lower_16(ya, ya, xa, p1024_mod); + sp_1024_mont_sub_16(ys, ya, xs, p1024_mod); + sp_1024_mont_sub_16(ya, ya, xa, p1024_mod); sp_1024_mont_mul_16(ya, ya, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_16(t6, p1024_mod, t6, p1024_mod); sp_1024_mont_mul_16(ys, ys, t6, p1024_mod, p1024_mp_mod); @@ -94743,7 +94774,7 @@ static int sp_1024_ecc_mulmod_win_add_sub_16(sp_point_1024* r, const sp_point_10 (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * + t = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * (65+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -95001,7 +95032,6 @@ extern void sp_1024_mont_sub_avx2_16(sp_digit* r, const sp_digit* a, const sp_di #ifdef __cplusplus } #endif -#define sp_1024_mont_sub_lower_avx2_16 sp_1024_mont_sub_avx2_16 #ifdef __cplusplus extern "C" { #endif @@ -95063,7 +95093,7 @@ static void sp_1024_proj_point_dbl_avx2_16(sp_point_1024* r, const sp_point_1024 /* X = X - Y */ sp_1024_mont_sub_avx2_16(x, x, y, p1024_mod); /* Y = Y - X */ - sp_1024_mont_sub_lower_avx2_16(y, y, x, p1024_mod); + sp_1024_mont_sub_avx2_16(y, y, x, p1024_mod); /* Y = Y * T1 */ sp_1024_mont_mul_avx2_16(y, y, t1, p1024_mod, p1024_mp_mod); /* Y = Y - T2 */ @@ -95185,7 +95215,7 @@ static int sp_1024_proj_point_dbl_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 break; case 16: /* Y = Y - X */ - sp_1024_mont_sub_lower_avx2_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_avx2_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 17; break; case 17: @@ -95210,8 +95240,6 @@ static int sp_1024_proj_point_dbl_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 return err; } #endif /* WOLFSSL_SP_NONBLOCK */ -#define sp_1024_mont_dbl_lower_avx2_16 sp_1024_mont_dbl_avx2_16 -#define sp_1024_mont_tpl_lower_avx2_16 sp_1024_mont_tpl_avx2_16 /* Double the Montgomery form projective point p a number of times. * * r Result of repeated doubling of point. 
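The sp_1024_from_mp hunk earlier in this patch (the DIGIT_BIT == 64 branch) replaces the old per-iteration index arithmetic with a negated-length counter: j starts at 0 - a->used, the copy mask is derived from its top bit, and the source index o only advances while j is still negative, so the loop copies exactly a->used digits, zero-fills the remainder, and never indexes past the end of a->dp. A minimal stand-alone version of the same idea, with illustrative names and a plain uint64_t digit type (an assumption, not the wolfSSL types), might look like:

#include <stdint.h>

/* Copy 'used' digits from src into the fixed-size dst[size], zero-padding the
 * tail, without a data-dependent branch or an out-of-bounds read.
 * Like the original, this assumes src points at at least one readable digit. */
static void copy_digits_ct(uint64_t* dst, int size, const uint64_t* src, int used)
{
    uint64_t j = (uint64_t)0 - (uint64_t)used;   /* top bit set for the first 'used' steps */
    int o = 0;
    int i;

    for (i = 0; i < size; i++) {
        uint64_t mask = (uint64_t)0 - (j >> 63); /* all-ones while i < used */
        dst[i] = src[o] & mask;
        j++;
        o += (int)(j >> 63);                     /* advance only while source digits remain */
    }
}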
@@ -95250,7 +95278,7 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_avx2_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_avx2_16(a, t1, p1024_mod); + sp_1024_mont_tpl_avx2_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_avx2_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -95259,8 +95287,8 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i, sp_1024_mont_dbl_avx2_16(t2, b, p1024_mod); sp_1024_mont_sub_avx2_16(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_avx2_16(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_avx2_16(b, t2, p1024_mod); + sp_1024_mont_sub_avx2_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_avx2_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_avx2_16(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -95280,7 +95308,7 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_avx2_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_avx2_16(a, t1, p1024_mod); + sp_1024_mont_tpl_avx2_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_avx2_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -95289,8 +95317,8 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i, sp_1024_mont_dbl_avx2_16(t2, b, p1024_mod); sp_1024_mont_sub_avx2_16(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - sp_1024_mont_sub_lower_avx2_16(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_avx2_16(b, t2, p1024_mod); + sp_1024_mont_sub_avx2_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_avx2_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_avx2_16(z, z, y, p1024_mod, p1024_mp_mod); /* t1 = Y^4 */ @@ -95314,12 +95342,12 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i, static void sp_1024_proj_point_add_avx2_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*16; - sp_digit* t3 = t + 4*16; - sp_digit* t4 = t + 6*16; - sp_digit* t5 = t + 8*16; - sp_digit* t6 = t + 10*16; + sp_digit* t6 = t; + sp_digit* t1 = t + 2*16; + sp_digit* t2 = t + 4*16; + sp_digit* t3 = t + 6*16; + sp_digit* t4 = t + 8*16; + sp_digit* t5 = t + 10*16; /* U1 = X1*Z2^2 */ sp_1024_mont_sqr_avx2_16(t1, q->z, p1024_mod, p1024_mp_mod); @@ -95341,17 +95369,9 @@ static void sp_1024_proj_point_add_avx2_16(sp_point_1024* r, sp_1024_proj_point_dbl_avx2_16(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t6; sp_digit* y = t1; sp_digit* z = t2; - int i; - - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); /* H = U2 - U1 */ sp_1024_mont_sub_avx2_16(t2, t2, t1, p1024_mod); @@ -95370,20 +95390,31 @@ static void sp_1024_proj_point_add_avx2_16(sp_point_1024* r, sp_1024_mont_dbl_avx2_16(t3, y, p1024_mod); sp_1024_mont_sub_avx2_16(x, x, t3, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_avx2_16(y, y, x, p1024_mod); + sp_1024_mont_sub_avx2_16(y, y, x, p1024_mod); sp_1024_mont_mul_avx2_16(y, y, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(y, y, t5, p1024_mod); - for (i = 0; i < 16; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + 
sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); + + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 16; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 16; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -95429,12 +95460,12 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 switch (ctx->state) { case 0: /* INIT */ - ctx->t1 = t; - ctx->t2 = t + 2*16; - ctx->t3 = t + 4*16; - ctx->t4 = t + 6*16; - ctx->t5 = t + 8*16; - ctx->t6 = t + 10*16; + ctx->t6 = t; + ctx->t1 = t + 2*16; + ctx->t2 = t + 4*16; + ctx->t3 = t + 6*16; + ctx->t4 = t + 8*16; + ctx->t5 = t + 10*16; ctx->x = ctx->t6; ctx->y = ctx->t1; ctx->z = ctx->t2; @@ -95541,7 +95572,7 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 break; case 21: /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ - sp_1024_mont_sub_lower_avx2_16(ctx->y, ctx->y, ctx->x, p1024_mod); + sp_1024_mont_sub_avx2_16(ctx->y, ctx->y, ctx->x, p1024_mod); ctx->state = 22; break; case 22: @@ -95554,22 +95585,28 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024 break; case 24: { - int i; - sp_digit maskp = 0 - (q->infinity & (!p->infinity)); - sp_digit maskq = 0 - (p->infinity & (!q->infinity)); - sp_digit maskt = ~(maskp | maskq); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - for (i = 0; i < 16; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (ctx->x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (ctx->y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (ctx->z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 16; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt); - } - for (i = 0; i < 16; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; ctx->state = 25; break; } @@ -95628,7 +95665,7 @@ static void sp_1024_proj_point_dbl_n_store_avx2_16(sp_point_1024* r, /* A = 3*(X^2 - W) */ sp_1024_mont_sqr_avx2_16(t1, x, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(t1, t1, w, p1024_mod); - sp_1024_mont_tpl_lower_avx2_16(a, t1, p1024_mod); + sp_1024_mont_tpl_avx2_16(a, t1, p1024_mod); /* B = X*Y^2 */ sp_1024_mont_sqr_avx2_16(t1, y, p1024_mod, p1024_mp_mod); sp_1024_mont_mul_avx2_16(b, t1, x, p1024_mod, p1024_mp_mod); @@ -95638,8 +95675,8 @@ static void sp_1024_proj_point_dbl_n_store_avx2_16(sp_point_1024* r, sp_1024_mont_dbl_avx2_16(t2, b, p1024_mod); sp_1024_mont_sub_avx2_16(x, x, t2, p1024_mod); /* B = 2.(B - X) */ - 
sp_1024_mont_sub_lower_avx2_16(t2, b, x, p1024_mod); - sp_1024_mont_dbl_lower_avx2_16(b, t2, p1024_mod); + sp_1024_mont_sub_avx2_16(t2, b, x, p1024_mod); + sp_1024_mont_dbl_avx2_16(b, t2, p1024_mod); /* Z = Z*Y */ sp_1024_mont_mul_avx2_16(r[j].z, z, y, p1024_mod, p1024_mp_mod); z = r[j].z; @@ -95727,8 +95764,8 @@ static void sp_1024_proj_point_add_sub_avx2_16(sp_point_1024* ra, sp_1024_mont_sub_avx2_16(xs, xs, t1, p1024_mod); /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */ /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */ - sp_1024_mont_sub_lower_avx2_16(ys, ya, xs, p1024_mod); - sp_1024_mont_sub_lower_avx2_16(ya, ya, xa, p1024_mod); + sp_1024_mont_sub_avx2_16(ys, ya, xs, p1024_mod); + sp_1024_mont_sub_avx2_16(ya, ya, xa, p1024_mod); sp_1024_mont_mul_avx2_16(ya, ya, t4, p1024_mod, p1024_mp_mod); sp_1024_mont_sub_avx2_16(t6, p1024_mod, t6, p1024_mod); sp_1024_mont_mul_avx2_16(ys, ys, t6, p1024_mod, p1024_mp_mod); @@ -95777,7 +95814,7 @@ static int sp_1024_ecc_mulmod_win_add_sub_avx2_16(sp_point_1024* r, const sp_poi (void)heap; #ifdef WOLFSSL_SP_SMALL_STACK - t = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * + t = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * (65+2), heap, DYNAMIC_TYPE_ECC); if (t == NULL) err = MEMORY_E; @@ -95907,12 +95944,12 @@ typedef struct sp_table_entry_1024 { static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*16; - sp_digit* t3 = t + 4*16; - sp_digit* t4 = t + 6*16; - sp_digit* t5 = t + 8*16; - sp_digit* t6 = t + 10*16; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*16; + sp_digit* t6 = t + 4*16; + sp_digit* t1 = t + 6*16; + sp_digit* t4 = t + 8*16; + sp_digit* t5 = t + 10*16; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -95928,13 +95965,9 @@ static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, sp_1024_proj_point_dbl_16(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_1024_mont_sub_16(t2, t2, p->x, p1024_mod); @@ -95943,33 +95976,40 @@ static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, /* Z3 = H*Z1 */ sp_1024_mont_mul_16(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_1024_mont_sqr_16(t1, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t3, p->x, t5, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_16(x, t1, t5, p1024_mod); - sp_1024_mont_dbl_16(t1, t3, p1024_mod); - sp_1024_mont_sub_16(x, x, t1, p1024_mod); + sp_1024_mont_sqr_16(t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t3, p->x, t1, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_16(t1, t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_16(t2, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_16(t2, t2, t1, p1024_mod); + sp_1024_mont_dbl_16(t5, t3, p1024_mod); + sp_1024_mont_sub_16(x, t2, t5, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_lower_16(t3, t3, x, p1024_mod); + sp_1024_mont_sub_16(t3, t3, x, p1024_mod); sp_1024_mont_mul_16(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_16(t5, t5, p->y, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_16(y, t3, t5, p1024_mod); + sp_1024_mont_mul_16(t1, t1, p->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_16(y, t3, t1, p1024_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - 
(p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 16; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 16; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 16; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -96364,12 +96404,12 @@ static int sp_1024_ecc_mulmod_16(sp_point_1024* r, const sp_point_1024* g, const static void sp_1024_proj_point_add_qz1_avx2_16(sp_point_1024* r, const sp_point_1024* p, const sp_point_1024* q, sp_digit* t) { - sp_digit* t1 = t; - sp_digit* t2 = t + 2*16; - sp_digit* t3 = t + 4*16; - sp_digit* t4 = t + 6*16; - sp_digit* t5 = t + 8*16; - sp_digit* t6 = t + 10*16; + sp_digit* t2 = t; + sp_digit* t3 = t + 2*16; + sp_digit* t6 = t + 4*16; + sp_digit* t1 = t + 6*16; + sp_digit* t4 = t + 8*16; + sp_digit* t5 = t + 10*16; /* Calculate values to subtract from P->x and P->y. */ /* U2 = X2*Z1^2 */ @@ -96385,13 +96425,9 @@ static void sp_1024_proj_point_add_qz1_avx2_16(sp_point_1024* r, sp_1024_proj_point_dbl_avx2_16(r, p, t); } else { - sp_digit maskp; - sp_digit maskq; - sp_digit maskt; sp_digit* x = t2; - sp_digit* y = t5; + sp_digit* y = t3; sp_digit* z = t6; - int i; /* H = U2 - X1 */ sp_1024_mont_sub_avx2_16(t2, t2, p->x, p1024_mod); @@ -96400,33 +96436,40 @@ static void sp_1024_proj_point_add_qz1_avx2_16(sp_point_1024* r, /* Z3 = H*Z1 */ sp_1024_mont_mul_avx2_16(z, p->z, t2, p1024_mod, p1024_mp_mod); /* X3 = R^2 - H^3 - 2*X1*H^2 */ - sp_1024_mont_sqr_avx2_16(t1, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_sqr_avx2_16(t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(t3, p->x, t5, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(t5, t5, t2, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_avx2_16(x, t1, t5, p1024_mod); - sp_1024_mont_dbl_avx2_16(t1, t3, p1024_mod); - sp_1024_mont_sub_avx2_16(x, x, t1, p1024_mod); + sp_1024_mont_sqr_avx2_16(t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t3, p->x, t1, p1024_mod, p1024_mp_mod); + sp_1024_mont_mul_avx2_16(t1, t1, t2, p1024_mod, p1024_mp_mod); + sp_1024_mont_sqr_avx2_16(t2, t4, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_avx2_16(t2, t2, t1, p1024_mod); + sp_1024_mont_dbl_avx2_16(t5, t3, p1024_mod); + sp_1024_mont_sub_avx2_16(x, t2, t5, p1024_mod); /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */ - sp_1024_mont_sub_lower_avx2_16(t3, t3, x, p1024_mod); + sp_1024_mont_sub_avx2_16(t3, t3, x, p1024_mod); sp_1024_mont_mul_avx2_16(t3, t3, t4, p1024_mod, p1024_mp_mod); - sp_1024_mont_mul_avx2_16(t5, t5, p->y, p1024_mod, p1024_mp_mod); - sp_1024_mont_sub_avx2_16(y, t3, t5, p1024_mod); + sp_1024_mont_mul_avx2_16(t1, t1, p->y, p1024_mod, p1024_mp_mod); + sp_1024_mont_sub_avx2_16(y, t3, t1, p1024_mod); + { + int i; + sp_digit maskp = 0 - (q->infinity & (!p->infinity)); + sp_digit maskq = 0 - (p->infinity & (!q->infinity)); + sp_digit maskt = ~(maskp | 
maskq); + sp_digit inf = (sp_digit)(p->infinity & q->infinity); - maskp = 0 - (q->infinity & (!p->infinity)); - maskq = 0 - (p->infinity & (!q->infinity)); - maskt = ~(maskp | maskq); - for (i = 0; i < 16; i++) { - r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt); + for (i = 0; i < 16; i++) { + r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | + (x[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | + (y[i] & maskt); + } + for (i = 0; i < 16; i++) { + r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | + (z[i] & maskt); + } + r->z[0] |= inf; + r->infinity = (word32)inf; } - for (i = 0; i < 16; i++) { - r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt); - } - for (i = 0; i < 16; i++) { - r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt); - } - r->z[0] |= p->infinity & q->infinity; - r->infinity = p->infinity & q->infinity; } } @@ -100239,7 +100282,7 @@ int sp_ecc_mulmod_base_add_1024(const mp_int* km, const ecc_point* am, #endif #ifdef WOLFSSL_SP_SMALL_STACK - point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, + point = (sp_point_1024*)XMALLOC(sizeof(sp_point_1024) * 2, heap, DYNAMIC_TYPE_ECC); if (point == NULL) err = MEMORY_E; diff --git a/wolfcrypt/src/sp_x86_64_asm.S b/wolfcrypt/src/sp_x86_64_asm.S index 18f36fe60..0fbacd68a 100644 --- a/wolfcrypt/src/sp_x86_64_asm.S +++ b/wolfcrypt/src/sp_x86_64_asm.S @@ -56741,52 +56741,6 @@ _sp_256_mont_sub_4: #ifndef __APPLE__ .size sp_256_mont_sub_4,.-sp_256_mont_sub_4 #endif /* __APPLE__ */ -/* Subtract two Montgomery form numbers (r = a - b % m). - * - * b is less than the modulus. - * - * r Result of subtration. - * a Number to subtract from in Montgomery form. - * b Number to subtract with in Montgomery form. - * m Modulus (prime). - */ -#ifndef __APPLE__ -.text -.globl sp_256_mont_sub_lower_4 -.type sp_256_mont_sub_lower_4,@function -.align 16 -sp_256_mont_sub_lower_4: -#else -.section __TEXT,__text -.globl _sp_256_mont_sub_lower_4 -.p2align 4 -_sp_256_mont_sub_lower_4: -#endif /* __APPLE__ */ - movq (%rsi), %rax - movq 8(%rsi), %rcx - movq 16(%rsi), %r8 - movq 24(%rsi), %r9 - subq (%rdx), %rax - movq $0xffffffff, %r10 - sbbq 8(%rdx), %rcx - movq $0xffffffff00000001, %r11 - sbbq 16(%rdx), %r8 - sbbq 24(%rdx), %r9 - sbbq %rsi, %rsi - andq %rsi, %r10 - andq %rsi, %r11 - addq %rsi, %rax - adcq %r10, %rcx - movq %rax, (%rdi) - adcq $0x00, %r8 - movq %rcx, 8(%rdi) - adcq %r11, %r9 - movq %r8, 16(%rdi) - movq %r9, 24(%rdi) - repz retq -#ifndef __APPLE__ -.size sp_256_mont_sub_lower_4,.-sp_256_mont_sub_lower_4 -#endif /* __APPLE__ */ /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * * r Result of division by 2. @@ -56834,71 +56788,6 @@ _sp_256_div2_4: #ifndef __APPLE__ .size sp_256_div2_4,.-sp_256_div2_4 #endif /* __APPLE__ */ -/* Triple a Montgomery form number (r = a + a + a % m). - * - * a is less than m. - * - * r Result of Tripling. - * a Number to triple in Montgomery form. - * m Modulus (prime). 
- */ -#ifndef __APPLE__ -.text -.globl sp_256_mont_tpl_lower_4 -.type sp_256_mont_tpl_lower_4,@function -.align 16 -sp_256_mont_tpl_lower_4: -#else -.section __TEXT,__text -.globl _sp_256_mont_tpl_lower_4 -.p2align 4 -_sp_256_mont_tpl_lower_4: -#endif /* __APPLE__ */ - movq (%rsi), %rdx - movq 8(%rsi), %rax - movq 16(%rsi), %rcx - movq 24(%rsi), %r8 - addq %rdx, %rdx - movq $0xffffffff, %r9 - adcq %rax, %rax - movq $0xffffffff00000001, %r10 - adcq %rcx, %rcx - adcq %r8, %r8 - sbbq %r11, %r11 - andq %r11, %r9 - andq %r11, %r10 - subq %r11, %rdx - sbbq %r9, %rax - sbbq $0x00, %rcx - sbbq %r10, %r8 - addq (%rsi), %rdx - movq $0xffffffff, %r9 - adcq 8(%rsi), %rax - movq $0xffffffff00000001, %r10 - adcq 16(%rsi), %rcx - adcq 24(%rsi), %r8 - sbbq %r11, %r11 - andq %r11, %r9 - andq %r11, %r10 - subq %r11, %rdx - sbbq %r9, %rax - sbbq $0x00, %rcx - sbbq %r10, %r8 - adcq $0x00, %r11 - andq %r11, %r9 - andq %r11, %r10 - subq %r11, %rdx - sbbq %r9, %rax - movq %rdx, (%rdi) - sbbq $0x00, %rcx - movq %rax, 8(%rdi) - sbbq %r10, %r8 - movq %rcx, 16(%rdi) - movq %r8, 24(%rdi) - repz retq -#ifndef __APPLE__ -.size sp_256_mont_tpl_lower_4,.-sp_256_mont_tpl_lower_4 -#endif /* __APPLE__ */ /* Two Montgomery numbers, subtract double second from first (r = a - 2.b % m). * * r Result of subtration. @@ -61241,68 +61130,6 @@ _sp_384_mont_sub_6: #ifndef __APPLE__ .size sp_384_mont_sub_6,.-sp_384_mont_sub_6 #endif /* __APPLE__ */ -/* Subtract two Montgomery form numbers (r = a - b % m). - * - * b is less than the modulus. - * - * r Result of subtration. - * a Number to subtract from in Montgomery form. - * b Number to subtract with in Montgomery form. - * m Modulus (prime). - */ -#ifndef __APPLE__ -.text -.globl sp_384_mont_sub_lower_6 -.type sp_384_mont_sub_lower_6,@function -.align 16 -sp_384_mont_sub_lower_6: -#else -.section __TEXT,__text -.globl _sp_384_mont_sub_lower_6 -.p2align 4 -_sp_384_mont_sub_lower_6: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - movq (%rsi), %rax - movq 8(%rsi), %rcx - movq 16(%rsi), %r8 - movq 24(%rsi), %r9 - movq 32(%rsi), %r10 - movq 40(%rsi), %r11 - subq (%rdx), %rax - movq $0xffffffff, %r12 - sbbq 8(%rdx), %rcx - movq $0xffffffff00000000, %r13 - sbbq 16(%rdx), %r8 - movq $0xfffffffffffffffe, %r14 - sbbq 24(%rdx), %r9 - sbbq 32(%rdx), %r10 - sbbq 40(%rdx), %r11 - sbbq %rsi, %rsi - andq %rsi, %r12 - andq %rsi, %r13 - andq %rsi, %r14 - addq %r12, %rax - adcq %r13, %rcx - movq %rax, (%rdi) - adcq %r14, %r8 - movq %rcx, 8(%rdi) - adcq %rsi, %r9 - movq %r8, 16(%rdi) - adcq %rsi, %r10 - movq %r9, 24(%rdi) - adcq %rsi, %r11 - movq %r10, 32(%rdi) - movq %r11, 40(%rdi) - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size sp_384_mont_sub_lower_6,.-sp_384_mont_sub_lower_6 -#endif /* __APPLE__ */ /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) * * r Result of division by 2. @@ -61380,158 +61207,6 @@ _sp_384_div2_6: #ifndef __APPLE__ .size sp_384_div2_6,.-sp_384_div2_6 #endif /* __APPLE__ */ -/* Double a Montgomery form number (r = a + a % m). - * - * a is less than m. - * - * r Result of doubling. - * a Number to double in Montgomery form. - * m Modulus (prime). 
- */ -#ifndef __APPLE__ -.text -.globl sp_384_mont_dbl_lower_6 -.type sp_384_mont_dbl_lower_6,@function -.align 16 -sp_384_mont_dbl_lower_6: -#else -.section __TEXT,__text -.globl _sp_384_mont_dbl_lower_6 -.p2align 4 -_sp_384_mont_dbl_lower_6: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - movq (%rsi), %rdx - movq 8(%rsi), %rax - movq 16(%rsi), %rcx - movq 24(%rsi), %r8 - movq 32(%rsi), %r9 - movq 40(%rsi), %r10 - addq %rdx, %rdx - movq $0xffffffff, %r11 - adcq %rax, %rax - movq $0xffffffff00000000, %r12 - adcq %rcx, %rcx - movq $0xfffffffffffffffe, %r13 - adcq %r8, %r8 - adcq %r9, %r9 - adcq %r10, %r10 - sbbq %r14, %r14 - andq %r14, %r11 - andq %r14, %r12 - andq %r14, %r13 - subq %r11, %rdx - sbbq %r12, %rax - movq %rdx, (%rdi) - sbbq %r13, %rcx - movq %rax, 8(%rdi) - sbbq %r14, %r8 - movq %rcx, 16(%rdi) - sbbq %r14, %r9 - movq %r8, 24(%rdi) - sbbq %r14, %r10 - movq %r9, 32(%rdi) - movq %r10, 40(%rdi) - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size sp_384_mont_dbl_lower_6,.-sp_384_mont_dbl_lower_6 -#endif /* __APPLE__ */ -/* Double a Montgomery form number (r = a + a % m). - * - * a is less than m. - * - * r Result of doubling. - * a Number to double in Montgomery form. - * m Modulus (prime). - */ -#ifndef __APPLE__ -.text -.globl sp_384_mont_tpl_lower_6 -.type sp_384_mont_tpl_lower_6,@function -.align 16 -sp_384_mont_tpl_lower_6: -#else -.section __TEXT,__text -.globl _sp_384_mont_tpl_lower_6 -.p2align 4 -_sp_384_mont_tpl_lower_6: -#endif /* __APPLE__ */ - pushq %r12 - pushq %r13 - pushq %r14 - movq (%rsi), %rdx - movq 8(%rsi), %rax - movq 16(%rsi), %rcx - movq 24(%rsi), %r8 - movq 32(%rsi), %r9 - movq 40(%rsi), %r10 - addq %rdx, %rdx - movq $0xffffffff, %r11 - adcq %rax, %rax - movq $0xffffffff00000000, %r12 - adcq %rcx, %rcx - movq $0xfffffffffffffffe, %r13 - adcq %r8, %r8 - adcq %r9, %r9 - adcq %r10, %r10 - sbbq %r14, %r14 - andq %r14, %r11 - andq %r14, %r12 - andq %r14, %r13 - subq %r11, %rdx - sbbq %r12, %rax - movq %rdx, (%rdi) - sbbq %r13, %rcx - sbbq %r14, %r8 - sbbq %r14, %r9 - sbbq %r14, %r10 - addq (%rsi), %rdx - movq $0xffffffff, %r11 - adcq 8(%rsi), %rax - movq $0xffffffff00000000, %r12 - adcq 16(%rsi), %rcx - movq $0xfffffffffffffffe, %r13 - adcq 24(%rsi), %r8 - adcq 32(%rsi), %r9 - adcq 40(%rsi), %r10 - sbbq %r14, %r14 - andq %r14, %r11 - andq %r14, %r12 - andq %r14, %r13 - subq %r11, %rdx - sbbq %r12, %rax - sbbq %r13, %rcx - sbbq %r14, %r8 - sbbq %r14, %r9 - sbbq %r14, %r10 - adcq $0x00, %r14 - andq %r14, %r11 - andq %r14, %r12 - andq %r14, %r13 - subq %r11, %rdx - sbbq %r12, %rax - movq %rdx, (%rdi) - sbbq %r13, %rcx - movq %rax, 8(%rdi) - sbbq %r14, %r8 - movq %rcx, 16(%rdi) - sbbq %r14, %r9 - movq %r8, 24(%rdi) - sbbq %r14, %r10 - movq %r9, 32(%rdi) - movq %r10, 40(%rdi) - popq %r14 - popq %r13 - popq %r12 - repz retq -#ifndef __APPLE__ -.size sp_384_mont_tpl_lower_6,.-sp_384_mont_tpl_lower_6 -#endif /* __APPLE__ */ #ifndef WC_NO_CACHE_RESISTANT /* Touch each possible point that could be being copied. * diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm index fcf4ce193..1f42b2755 100644 --- a/wolfcrypt/src/sp_x86_64_asm.asm +++ b/wolfcrypt/src/sp_x86_64_asm.asm @@ -55582,45 +55582,6 @@ sp_256_mont_sub_4 PROC ret sp_256_mont_sub_4 ENDP _text ENDS -; /* Subtract two Montgomery form numbers (r = a - b % m). -; * -; * b is less than the modulus. -; * -; * r Result of subtration. -; * a Number to subtract from in Montgomery form. -; * b Number to subtract with in Montgomery form. 
-; * m Modulus (prime). -; */ -_text SEGMENT READONLY PARA -sp_256_mont_sub_lower_4 PROC - push r12 - push r13 - mov rax, QWORD PTR [rdx] - mov r9, QWORD PTR [rdx+8] - mov r10, QWORD PTR [rdx+16] - mov r11, QWORD PTR [rdx+24] - sub rax, QWORD PTR [r8] - mov r12, 4294967295 - sbb r9, QWORD PTR [r8+8] - mov r13, 18446744069414584321 - sbb r10, QWORD PTR [r8+16] - sbb r11, QWORD PTR [r8+24] - sbb rdx, rdx - and r12, rdx - and r13, rdx - add rax, rdx - adc r9, r12 - mov QWORD PTR [rcx], rax - adc r10, 0 - mov QWORD PTR [rcx+8], r9 - adc r11, r13 - mov QWORD PTR [rcx+16], r10 - mov QWORD PTR [rcx+24], r11 - pop r13 - pop r12 - ret -sp_256_mont_sub_lower_4 ENDP -_text ENDS ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) ; * ; * r Result of division by 2. @@ -55661,64 +55622,6 @@ sp_256_div2_4 PROC ret sp_256_div2_4 ENDP _text ENDS -; /* Triple a Montgomery form number (r = a + a + a % m). -; * -; * a is less than m. -; * -; * r Result of Tripling. -; * a Number to triple in Montgomery form. -; * m Modulus (prime). -; */ -_text SEGMENT READONLY PARA -sp_256_mont_tpl_lower_4 PROC - push r12 - push r13 - mov rax, QWORD PTR [rdx] - mov r8, QWORD PTR [rdx+8] - mov r9, QWORD PTR [rdx+16] - mov r10, QWORD PTR [rdx+24] - add rax, rax - mov r11, 4294967295 - adc r8, r8 - mov r12, 18446744069414584321 - adc r9, r9 - adc r10, r10 - sbb r13, r13 - and r11, r13 - and r12, r13 - sub rax, r13 - sbb r8, r11 - sbb r9, 0 - sbb r10, r12 - add rax, QWORD PTR [rdx] - mov r11, 4294967295 - adc r8, QWORD PTR [rdx+8] - mov r12, 18446744069414584321 - adc r9, QWORD PTR [rdx+16] - adc r10, QWORD PTR [rdx+24] - sbb r13, r13 - and r11, r13 - and r12, r13 - sub rax, r13 - sbb r8, r11 - sbb r9, 0 - sbb r10, r12 - adc r13, 0 - and r11, r13 - and r12, r13 - sub rax, r13 - sbb r8, r11 - mov QWORD PTR [rcx], rax - sbb r9, 0 - mov QWORD PTR [rcx+8], r8 - sbb r10, r12 - mov QWORD PTR [rcx+16], r9 - mov QWORD PTR [rcx+24], r10 - pop r13 - pop r12 - ret -sp_256_mont_tpl_lower_4 ENDP -_text ENDS ; /* Two Montgomery numbers, subtract double second from first (r = a - 2.b % m). ; * ; * r Result of subtration. @@ -59792,61 +59695,6 @@ sp_384_mont_sub_6 PROC ret sp_384_mont_sub_6 ENDP _text ENDS -; /* Subtract two Montgomery form numbers (r = a - b % m). -; * -; * b is less than the modulus. -; * -; * r Result of subtration. -; * a Number to subtract from in Montgomery form. -; * b Number to subtract with in Montgomery form. -; * m Modulus (prime). -; */ -_text SEGMENT READONLY PARA -sp_384_mont_sub_lower_6 PROC - push r12 - push r13 - push r14 - push r15 - push rdi - mov rax, QWORD PTR [rdx] - mov r9, QWORD PTR [rdx+8] - mov r10, QWORD PTR [rdx+16] - mov r11, QWORD PTR [rdx+24] - mov r12, QWORD PTR [rdx+32] - mov r13, QWORD PTR [rdx+40] - sub rax, QWORD PTR [r8] - mov r14, 4294967295 - sbb r9, QWORD PTR [r8+8] - mov r15, 18446744069414584320 - sbb r10, QWORD PTR [r8+16] - mov rdi, 18446744073709551614 - sbb r11, QWORD PTR [r8+24] - sbb r12, QWORD PTR [r8+32] - sbb r13, QWORD PTR [r8+40] - sbb rdx, rdx - and r14, rdx - and r15, rdx - and rdi, rdx - add rax, r14 - adc r9, r15 - mov QWORD PTR [rcx], rax - adc r10, rdi - mov QWORD PTR [rcx+8], r9 - adc r11, rdx - mov QWORD PTR [rcx+16], r10 - adc r12, rdx - mov QWORD PTR [rcx+24], r11 - adc r13, rdx - mov QWORD PTR [rcx+32], r12 - mov QWORD PTR [rcx+40], r13 - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret -sp_384_mont_sub_lower_6 ENDP -_text ENDS ; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) ; * ; * r Result of division by 2. 
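Both assembly back-ends drop the same set of now-unused *_lower helpers; the GAS (.S) and MASM (.asm) bodies above are register-for-register counterparts of each other. Purely as a readability aid (this C is not part of the patch), the removed sp_256_mont_sub_lower_4 corresponds to: subtract two 4x64-bit values and, if the subtraction borrowed, add the P-256 modulus back, the masked immediates in the deleted listings being the non-trivial words of that modulus. unsigned __int128 is used below only to propagate carries and is a GCC/Clang extension.

#include <stdint.h>

static void p256_sub_addback(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
    /* Little-endian 64-bit words of p256 = 2^256 - 2^224 + 2^192 + 2^96 - 1. */
    static const uint64_t p[4] = { 0xffffffffffffffffULL, 0x00000000ffffffffULL,
                                   0x0000000000000000ULL, 0xffffffff00000001ULL };
    unsigned __int128 t;
    uint64_t d[4];
    uint64_t borrow = 0;
    uint64_t carry = 0;
    uint64_t mask;
    int i;

    for (i = 0; i < 4; i++) {                    /* d = a - b, tracking the borrow */
        t = (unsigned __int128)a[i] - b[i] - borrow;
        d[i] = (uint64_t)t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    mask = (uint64_t)0 - borrow;                 /* all-ones when a < b */
    for (i = 0; i < 4; i++) {                    /* add the modulus back under the mask */
        t = (unsigned __int128)d[i] + (p[i] & mask) + carry;
        r[i] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
}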
@@ -59917,144 +59765,6 @@ sp_384_div2_6 PROC ret sp_384_div2_6 ENDP _text ENDS -; /* Double a Montgomery form number (r = a + a % m). -; * -; * a is less than m. -; * -; * r Result of doubling. -; * a Number to double in Montgomery form. -; * m Modulus (prime). -; */ -_text SEGMENT READONLY PARA -sp_384_mont_dbl_lower_6 PROC - push r12 - push r13 - push r14 - push r15 - push rdi - mov rax, QWORD PTR [rdx] - mov r8, QWORD PTR [rdx+8] - mov r9, QWORD PTR [rdx+16] - mov r10, QWORD PTR [rdx+24] - mov r11, QWORD PTR [rdx+32] - mov r12, QWORD PTR [rdx+40] - add rax, rax - mov r13, 4294967295 - adc r8, r8 - mov r14, 18446744069414584320 - adc r9, r9 - mov r15, 18446744073709551614 - adc r10, r10 - adc r11, r11 - adc r12, r12 - sbb rdi, rdi - and r13, rdi - and r14, rdi - and r15, rdi - sub rax, r13 - sbb r8, r14 - mov QWORD PTR [rcx], rax - sbb r9, r15 - mov QWORD PTR [rcx+8], r8 - sbb r10, rdi - mov QWORD PTR [rcx+16], r9 - sbb r11, rdi - mov QWORD PTR [rcx+24], r10 - sbb r12, rdi - mov QWORD PTR [rcx+32], r11 - mov QWORD PTR [rcx+40], r12 - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret -sp_384_mont_dbl_lower_6 ENDP -_text ENDS -; /* Double a Montgomery form number (r = a + a % m). -; * -; * a is less than m. -; * -; * r Result of doubling. -; * a Number to double in Montgomery form. -; * m Modulus (prime). -; */ -_text SEGMENT READONLY PARA -sp_384_mont_tpl_lower_6 PROC - push r12 - push r13 - push r14 - push r15 - push rdi - mov rax, QWORD PTR [rdx] - mov r8, QWORD PTR [rdx+8] - mov r9, QWORD PTR [rdx+16] - mov r10, QWORD PTR [rdx+24] - mov r11, QWORD PTR [rdx+32] - mov r12, QWORD PTR [rdx+40] - add rax, rax - mov r13, 4294967295 - adc r8, r8 - mov r14, 18446744069414584320 - adc r9, r9 - mov r15, 18446744073709551614 - adc r10, r10 - adc r11, r11 - adc r12, r12 - sbb rdi, rdi - and r13, rdi - and r14, rdi - and r15, rdi - sub rax, r13 - sbb r8, r14 - mov QWORD PTR [rcx], rax - sbb r9, r15 - sbb r10, rdi - sbb r11, rdi - sbb r12, rdi - add rax, QWORD PTR [rdx] - mov r13, 4294967295 - adc r8, QWORD PTR [rdx+8] - mov r14, 18446744069414584320 - adc r9, QWORD PTR [rdx+16] - mov r15, 18446744073709551614 - adc r10, QWORD PTR [rdx+24] - adc r11, QWORD PTR [rdx+32] - adc r12, QWORD PTR [rdx+40] - sbb rdi, rdi - and r13, rdi - and r14, rdi - and r15, rdi - sub rax, r13 - sbb r8, r14 - sbb r9, r15 - sbb r10, rdi - sbb r11, rdi - sbb r12, rdi - adc rdi, 0 - and r13, rdi - and r14, rdi - and r15, rdi - sub rax, r13 - sbb r8, r14 - mov QWORD PTR [rcx], rax - sbb r9, r15 - mov QWORD PTR [rcx+8], r8 - sbb r10, rdi - mov QWORD PTR [rcx+16], r9 - sbb r11, rdi - mov QWORD PTR [rcx+24], r10 - sbb r12, rdi - mov QWORD PTR [rcx+32], r11 - mov QWORD PTR [rcx+40], r12 - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret -sp_384_mont_tpl_lower_6 ENDP -_text ENDS IFNDEF WC_NO_CACHE_RESISTANT ; /* Touch each possible point that could be being copied. ; *
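The deleted comments spell out the precondition the *_lower variants relied on: the operand (or subtrahend) was already known to be below the modulus, so a cheaper correction step sufficed. With the C call sites earlier in this patch rerouted to the general sp_*_mont_sub/dbl/tpl routines, the specialised bodies have no remaining callers in either assembly file. As a generic illustration only (the real wolfSSL routines bake the prime-specific masked constants in, as the removed listings show), a fully reduced result can always be obtained by following the raw add or subtract with a constant-time conditional subtraction of the modulus:

/* Generic sketch: r = a if a < m, else a - m, chosen by mask rather than branch.
 * sp_digit_t and the helper name are illustrative, not wolfSSL identifiers.
 * In this simplified form r must not alias a. */
typedef unsigned long long sp_digit_t;

static void cond_sub_mod(sp_digit_t* r, const sp_digit_t* a, const sp_digit_t* m, int n)
{
    sp_digit_t d;
    sp_digit_t borrow = 0;
    sp_digit_t mask;
    int i;

    for (i = 0; i < n; i++) {                     /* trial subtraction a - m */
        d = a[i] - m[i] - borrow;
        borrow = (sp_digit_t)((d > a[i]) || (d == a[i] && borrow));
        r[i] = d;
    }
    mask = (sp_digit_t)0 - borrow;                /* all-ones when a < m: keep a */
    for (i = 0; i < n; i++) {
        r[i] = (a[i] & mask) | (r[i] & ~mask);
    }
}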