From 9ac3083e5d7a4691e9eb6305b1b076e778252155 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Mon, 20 Nov 2023 13:05:58 +1000 Subject: [PATCH] Thumb2 ASM fixes Make a separate AES implementation for IAR that has AES_encrypt_block and AES_decrypt_block inlined. The default code relies on the compiler using specific registers and not modifying others. Improve performance of the small SP ASM code for RSA. --- wolfcrypt/src/port/arm/thumb2-aes-asm.S | 216 ++- wolfcrypt/src/port/arm/thumb2-aes-asm_c.c | 357 ++-- wolfcrypt/src/port/arm/thumb2-sha256-asm.S | 8 +- wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c | 19 +- wolfcrypt/src/port/arm/thumb2-sha512-asm.S | 8 +- wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c | 19 +- wolfcrypt/src/sp_arm32.c | 1692 +++++++++++++++--- wolfcrypt/src/sp_cortexm.c | 1021 ++++++----- 8 files changed, 2428 insertions(+), 912 deletions(-) diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm.S b/wolfcrypt/src/port/arm/thumb2-aes-asm.S index fb0888a2a..0badf8f97 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm.S @@ -670,10 +670,10 @@ L_AES_invert_key_mix_loop: EOR r8, r8, r9, ROR #24 STR r8, [r0], #4 SUBS r11, r11, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_invert_key_mix_loop #else - BNE.N L_AES_invert_key_mix_loop + BNE.W L_AES_invert_key_mix_loop #endif POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} /* Cycle Count = 165 */ @@ -703,16 +703,16 @@ AES_set_encrypt_key: LDR r10, L_AES_Thumb2_te ADR lr, L_AES_Thumb2_rcon CMP r1, #0x80 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_set_encrypt_key_start_128 #else - BEQ.N L_AES_set_encrypt_key_start_128 + BEQ.W L_AES_set_encrypt_key_start_128 #endif CMP r1, #0xc0 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_set_encrypt_key_start_192 #else - BEQ.N L_AES_set_encrypt_key_start_192 + BEQ.W L_AES_set_encrypt_key_start_192 #endif LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] @@ -799,7 +799,11 @@ L_AES_set_encrypt_key_loop_256: ADD r2, r2, #0x10 STM r2, {r4, r5, r6, r7} SUB r2, r2, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_set_encrypt_key_end +#else + B.N L_AES_set_encrypt_key_end +#endif L_AES_set_encrypt_key_start_192: LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] @@ -861,7 +865,11 @@ L_AES_set_encrypt_key_loop_192: EOR r6, r6, r5 EOR r7, r7, r6 STM r2, {r4, r5, r6, r7} +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_set_encrypt_key_end +#else + B.N L_AES_set_encrypt_key_end +#endif L_AES_set_encrypt_key_start_128: LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] @@ -1009,10 +1017,10 @@ L_AES_encrypt_block_nr: EOR r6, r6, r10 EOR r7, r7, r11 SUBS r1, r1, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_encrypt_block_nr #else - BNE.N L_AES_encrypt_block_nr + BNE.W L_AES_encrypt_block_nr #endif UBFX r8, r5, #16, #8 LSR r11, r4, #24 @@ -1137,16 +1145,16 @@ AES_ECB_encrypt: LDR r12, [sp, #36] PUSH {r3} CMP r12, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_encrypt_start_block_128 #else - BEQ.N L_AES_ECB_encrypt_start_block_128 + BEQ.W L_AES_ECB_encrypt_start_block_128 #endif CMP r12, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_encrypt_start_block_192 #else - BEQ.N 
L_AES_ECB_encrypt_start_block_192 + BEQ.W L_AES_ECB_encrypt_start_block_192 #endif L_AES_ECB_encrypt_loop_block_256: LDR r4, [lr] @@ -1179,12 +1187,16 @@ L_AES_ECB_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_encrypt_loop_block_256 #else - BNE.N L_AES_ECB_encrypt_loop_block_256 + BNE.W L_AES_ECB_encrypt_loop_block_256 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_encrypt_end +#else + B.N L_AES_ECB_encrypt_end +#endif L_AES_ECB_encrypt_start_block_192: L_AES_ECB_encrypt_loop_block_192: LDR r4, [lr] @@ -1217,12 +1229,16 @@ L_AES_ECB_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_encrypt_loop_block_192 #else - BNE.N L_AES_ECB_encrypt_loop_block_192 + BNE.W L_AES_ECB_encrypt_loop_block_192 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_encrypt_end +#else + B.N L_AES_ECB_encrypt_end +#endif L_AES_ECB_encrypt_start_block_128: L_AES_ECB_encrypt_loop_block_128: LDR r4, [lr] @@ -1255,10 +1271,10 @@ L_AES_ECB_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_encrypt_loop_block_128 #else - BNE.N L_AES_ECB_encrypt_loop_block_128 + BNE.W L_AES_ECB_encrypt_loop_block_128 #endif L_AES_ECB_encrypt_end: POP {r3} @@ -1280,16 +1296,16 @@ AES_CBC_encrypt: LDM r9, {r4, r5, r6, r7} PUSH {r3, r9} CMP r8, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_encrypt_start_block_128 #else - BEQ.N L_AES_CBC_encrypt_start_block_128 + BEQ.W L_AES_CBC_encrypt_start_block_128 #endif CMP r8, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_encrypt_start_block_192 #else - BEQ.N L_AES_CBC_encrypt_start_block_192 + BEQ.W L_AES_CBC_encrypt_start_block_192 #endif L_AES_CBC_encrypt_loop_block_256: LDR r8, [lr] @@ -1326,12 +1342,16 @@ L_AES_CBC_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_encrypt_loop_block_256 #else - BNE.N L_AES_CBC_encrypt_loop_block_256 + BNE.W L_AES_CBC_encrypt_loop_block_256 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_CBC_encrypt_end +#else + B.N L_AES_CBC_encrypt_end +#endif L_AES_CBC_encrypt_start_block_192: L_AES_CBC_encrypt_loop_block_192: LDR r8, [lr] @@ -1368,12 +1388,16 @@ L_AES_CBC_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_encrypt_loop_block_192 #else - BNE.N L_AES_CBC_encrypt_loop_block_192 + BNE.W L_AES_CBC_encrypt_loop_block_192 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_CBC_encrypt_end +#else + B.N L_AES_CBC_encrypt_end +#endif L_AES_CBC_encrypt_start_block_128: L_AES_CBC_encrypt_loop_block_128: LDR r8, [lr] @@ -1410,10 +1434,10 @@ L_AES_CBC_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_encrypt_loop_block_128 #else - BNE.N L_AES_CBC_encrypt_loop_block_128 + BNE.W L_AES_CBC_encrypt_loop_block_128 #endif L_AES_CBC_encrypt_end: POP {r3, r9} @@ -1441,16 +1465,16 @@ AES_CTR_encrypt: STM r8, {r4, r5, r6, r7} PUSH {r3, r8} CMP r12, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CTR_encrypt_start_block_128 #else - BEQ.N L_AES_CTR_encrypt_start_block_128 + BEQ.W L_AES_CTR_encrypt_start_block_128 #endif CMP r12, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CTR_encrypt_start_block_192 #else - BEQ.N L_AES_CTR_encrypt_start_block_192 + BEQ.W L_AES_CTR_encrypt_start_block_192 #endif L_AES_CTR_encrypt_loop_block_256: PUSH {r1, r2, lr} @@ -1491,12 +1515,16 @@ L_AES_CTR_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CTR_encrypt_loop_block_256 #else - BNE.N L_AES_CTR_encrypt_loop_block_256 + BNE.W L_AES_CTR_encrypt_loop_block_256 #endif +#ifdef __GNUC__ B L_AES_CTR_encrypt_end +#else + B.W L_AES_CTR_encrypt_end +#endif L_AES_CTR_encrypt_start_block_192: L_AES_CTR_encrypt_loop_block_192: PUSH {r1, r2, lr} @@ -1537,12 +1565,16 @@ L_AES_CTR_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CTR_encrypt_loop_block_192 #else - BNE.N L_AES_CTR_encrypt_loop_block_192 + BNE.W L_AES_CTR_encrypt_loop_block_192 #endif +#ifdef __GNUC__ B L_AES_CTR_encrypt_end +#else + B.W L_AES_CTR_encrypt_end +#endif L_AES_CTR_encrypt_start_block_128: L_AES_CTR_encrypt_loop_block_128: PUSH {r1, r2, lr} @@ -1583,10 +1615,10 @@ L_AES_CTR_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CTR_encrypt_loop_block_128 #else - BNE.N L_AES_CTR_encrypt_loop_block_128 + BNE.W L_AES_CTR_encrypt_loop_block_128 #endif L_AES_CTR_encrypt_end: POP {r3, r8} @@ -1709,10 +1741,10 @@ L_AES_decrypt_block_nr: EOR r6, r6, r10 EOR r7, r7, r11 SUBS r1, r1, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_decrypt_block_nr #else - BNE.N L_AES_decrypt_block_nr + BNE.W L_AES_decrypt_block_nr #endif UBFX r8, r7, #16, #8 LSR r11, r4, #24 @@ -2097,16 +2129,16 @@ AES_ECB_decrypt: MOV r12, r2 ADR r2, L_AES_Thumb2_td4 CMP r8, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_decrypt_start_block_128 #else - BEQ.N L_AES_ECB_decrypt_start_block_128 + BEQ.W L_AES_ECB_decrypt_start_block_128 #endif CMP r8, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_decrypt_start_block_192 #else - BEQ.N L_AES_ECB_decrypt_start_block_192 + BEQ.W L_AES_ECB_decrypt_start_block_192 #endif L_AES_ECB_decrypt_loop_block_256: LDR r4, [lr] @@ -2138,12 +2170,16 @@ L_AES_ECB_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_decrypt_loop_block_256 #else - BNE.N L_AES_ECB_decrypt_loop_block_256 + BNE.W L_AES_ECB_decrypt_loop_block_256 #endif +#if defined(__GNUC__) || 
defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_decrypt_end +#else + B.N L_AES_ECB_decrypt_end +#endif L_AES_ECB_decrypt_start_block_192: L_AES_ECB_decrypt_loop_block_192: LDR r4, [lr] @@ -2175,12 +2211,16 @@ L_AES_ECB_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_decrypt_loop_block_192 #else - BNE.N L_AES_ECB_decrypt_loop_block_192 + BNE.W L_AES_ECB_decrypt_loop_block_192 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_decrypt_end +#else + B.N L_AES_ECB_decrypt_end +#endif L_AES_ECB_decrypt_start_block_128: L_AES_ECB_decrypt_loop_block_128: LDR r4, [lr] @@ -2212,10 +2252,10 @@ L_AES_ECB_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_decrypt_loop_block_128 #else - BNE.N L_AES_ECB_decrypt_loop_block_128 + BNE.W L_AES_ECB_decrypt_loop_block_128 #endif L_AES_ECB_decrypt_end: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -2237,16 +2277,16 @@ AES_CBC_decrypt: ADR r2, L_AES_Thumb2_td4 PUSH {r3, r4} CMP r8, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_loop_block_128 #else - BEQ.N L_AES_CBC_decrypt_loop_block_128 + BEQ.W L_AES_CBC_decrypt_loop_block_128 #endif CMP r8, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_loop_block_192 #else - BEQ.N L_AES_CBC_decrypt_loop_block_192 + BEQ.W L_AES_CBC_decrypt_loop_block_192 #endif L_AES_CBC_decrypt_loop_block_256: PUSH {r1, r12, lr} @@ -2288,10 +2328,10 @@ L_AES_CBC_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_end_odd #else - BEQ.N L_AES_CBC_decrypt_end_odd + BEQ.W L_AES_CBC_decrypt_end_odd #endif PUSH {r1, r12, lr} LDR r4, [lr] @@ -2333,12 +2373,16 @@ L_AES_CBC_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_decrypt_loop_block_256 #else - BNE.N L_AES_CBC_decrypt_loop_block_256 + BNE.W L_AES_CBC_decrypt_loop_block_256 #endif +#ifdef __GNUC__ B L_AES_CBC_decrypt_end +#else + B.W L_AES_CBC_decrypt_end +#endif L_AES_CBC_decrypt_loop_block_192: PUSH {r1, r12, lr} LDR r4, [lr] @@ -2379,10 +2423,10 @@ L_AES_CBC_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_end_odd #else - BEQ.N L_AES_CBC_decrypt_end_odd + BEQ.W L_AES_CBC_decrypt_end_odd #endif PUSH {r1, r12, lr} LDR r4, [lr] @@ -2424,12 +2468,16 @@ L_AES_CBC_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_decrypt_loop_block_192 #else - BNE.N L_AES_CBC_decrypt_loop_block_192 + BNE.W L_AES_CBC_decrypt_loop_block_192 #endif +#ifdef __GNUC__ B L_AES_CBC_decrypt_end +#else + B.W L_AES_CBC_decrypt_end +#endif L_AES_CBC_decrypt_loop_block_128: PUSH {r1, r12, lr} LDR r4, [lr] @@ -2470,10 +2518,10 @@ L_AES_CBC_decrypt_loop_block_128: SUBS r12, r12, 
#0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_end_odd #else - BEQ.N L_AES_CBC_decrypt_end_odd + BEQ.W L_AES_CBC_decrypt_end_odd #endif PUSH {r1, r12, lr} LDR r4, [lr] @@ -2515,12 +2563,16 @@ L_AES_CBC_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_decrypt_loop_block_128 #else - BNE.N L_AES_CBC_decrypt_loop_block_128 + BNE.W L_AES_CBC_decrypt_loop_block_128 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_CBC_decrypt_end +#else + B.N L_AES_CBC_decrypt_end +#endif L_AES_CBC_decrypt_end_odd: LDR r4, [sp, #4] LDRD r8, r9, [r4, #16] @@ -3109,10 +3161,10 @@ L_GCM_gmult_len_start_block: POP {r3} SUBS r3, r3, #0x10 ADD r2, r2, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_GCM_gmult_len_start_block #else - BNE.N L_GCM_gmult_len_start_block + BNE.W L_GCM_gmult_len_start_block #endif POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} /* Cycle Count = 742 */ @@ -3141,16 +3193,16 @@ AES_GCM_encrypt: STM r8, {r4, r5, r6, r7} PUSH {r3, r8} CMP r12, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_GCM_encrypt_start_block_128 #else - BEQ.N L_AES_GCM_encrypt_start_block_128 + BEQ.W L_AES_GCM_encrypt_start_block_128 #endif CMP r12, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_GCM_encrypt_start_block_192 #else - BEQ.N L_AES_GCM_encrypt_start_block_192 + BEQ.W L_AES_GCM_encrypt_start_block_192 #endif L_AES_GCM_encrypt_loop_block_256: PUSH {r1, r2, lr} @@ -3188,12 +3240,16 @@ L_AES_GCM_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_GCM_encrypt_loop_block_256 #else - BNE.N L_AES_GCM_encrypt_loop_block_256 + BNE.W L_AES_GCM_encrypt_loop_block_256 #endif +#ifdef __GNUC__ B L_AES_GCM_encrypt_end +#else + B.W L_AES_GCM_encrypt_end +#endif L_AES_GCM_encrypt_start_block_192: L_AES_GCM_encrypt_loop_block_192: PUSH {r1, r2, lr} @@ -3231,12 +3287,16 @@ L_AES_GCM_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_GCM_encrypt_loop_block_192 #else - BNE.N L_AES_GCM_encrypt_loop_block_192 + BNE.W L_AES_GCM_encrypt_loop_block_192 #endif +#ifdef __GNUC__ B L_AES_GCM_encrypt_end +#else + B.W L_AES_GCM_encrypt_end +#endif L_AES_GCM_encrypt_start_block_128: L_AES_GCM_encrypt_loop_block_128: PUSH {r1, r2, lr} @@ -3274,10 +3334,10 @@ L_AES_GCM_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_GCM_encrypt_loop_block_128 #else - BNE.N L_AES_GCM_encrypt_loop_block_128 + BNE.W L_AES_GCM_encrypt_loop_block_128 #endif L_AES_GCM_encrypt_end: POP {r3, r8} diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c index f6bad8bd2..7d5357f1a 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c @@ -301,18 +301,19 @@ void AES_invert_key(unsigned char* ks, word32 rounds) "EOR r8, r8, r9, 
ROR #24\n\t" "STR r8, [%[ks]], #4\n\t" "SUBS r11, r11, #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_invert_key_mix_loop\n\t" #else - "BNE.N L_AES_invert_key_mix_loop\n\t" + "BNE.W L_AES_invert_key_mix_loop\n\t" #endif - : [ks] "+r" (ks), [rounds] "+r" (rounds), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_td] "+r" (L_AES_Thumb2_td_c) -#else - [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te), [L_AES_Thumb2_td] "r" (L_AES_Thumb2_td) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [ks] "+r" (ks), [rounds] "+r" (rounds), + [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_td] "+r" (L_AES_Thumb2_td_c) : +#else + : [ks] "+r" (ks), [rounds] "+r" (rounds) + : [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te), [L_AES_Thumb2_td] "r" (L_AES_Thumb2_td) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); } @@ -321,7 +322,7 @@ void AES_invert_key(unsigned char* ks, word32 rounds) static const uint32_t L_AES_Thumb2_rcon[] = { 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000, - 0x1b000000, 0x36000000, + 0x1b000000, 0x36000000 }; void AES_set_encrypt_key(const unsigned char* key, word32 len, @@ -344,16 +345,16 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "MOV r10, %[L_AES_Thumb2_te]\n\t" "MOV lr, %[L_AES_Thumb2_rcon]\n\t" "CMP %[len], #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_set_encrypt_key_start_128\n\t" #else - "BEQ.N L_AES_set_encrypt_key_start_128\n\t" + "BEQ.W L_AES_set_encrypt_key_start_128\n\t" #endif "CMP %[len], #0xc0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_set_encrypt_key_start_192\n\t" #else - "BEQ.N L_AES_set_encrypt_key_start_192\n\t" + "BEQ.W L_AES_set_encrypt_key_start_192\n\t" #endif "LDRD r4, r5, [%[key]]\n\t" "LDRD r6, r7, [%[key], #8]\n\t" @@ -441,7 +442,11 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "ADD %[ks], %[ks], #0x10\n\t" "STM %[ks], {r4, r5, r6, r7}\n\t" "SUB %[ks], %[ks], #0x10\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_AES_set_encrypt_key_end\n\t" +#else + "B.N L_AES_set_encrypt_key_end\n\t" +#endif "\n" "L_AES_set_encrypt_key_start_192:\n\t" "LDRD r4, r5, [%[key]]\n\t" @@ -505,7 +510,11 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "EOR r6, r6, r5\n\t" "EOR r7, r7, r6\n\t" "STM %[ks], {r4, r5, r6, r7}\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_AES_set_encrypt_key_end\n\t" +#else + "B.N L_AES_set_encrypt_key_end\n\t" +#endif "\n" "L_AES_set_encrypt_key_start_128:\n\t" "LDRD r4, r5, [%[key]]\n\t" @@ -545,13 +554,14 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks #endif "\n" "L_AES_set_encrypt_key_end:\n\t" - : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_rcon] "+r" (L_AES_Thumb2_rcon_c) -#else - [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te), [L_AES_Thumb2_rcon] "r" (L_AES_Thumb2_rcon) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), + [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_rcon] "+r" (L_AES_Thumb2_rcon_c) : +#else + : [key] 
"+r" (key), [len] "+r" (len), [ks] "+r" (ks) + : [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te), [L_AES_Thumb2_rcon] "r" (L_AES_Thumb2_rcon) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10" ); } @@ -674,10 +684,10 @@ void AES_encrypt_block(const uint32_t* te, int nr, int len, const uint32_t* ks) "EOR r6, r6, r10\n\t" "EOR r7, r7, r11\n\t" "SUBS %[nr], %[nr], #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_encrypt_block_nr\n\t" #else - "BNE.N L_AES_encrypt_block_nr\n\t" + "BNE.W L_AES_encrypt_block_nr\n\t" #endif "UBFX r8, r5, #16, #8\n\t" "LSR r11, r4, #24\n\t" @@ -809,19 +819,23 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long __asm__ __volatile__ ( "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "PUSH {%[ks]}\n\t" "CMP r12, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_ECB_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_ECB_encrypt_start_block_128\n\t" + "BEQ.W L_AES_ECB_encrypt_start_block_128\n\t" #endif "CMP r12, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_ECB_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_ECB_encrypt_start_block_192\n\t" + "BEQ.W L_AES_ECB_encrypt_start_block_192\n\t" #endif "\n" "L_AES_ECB_encrypt_loop_block_256:\n\t" @@ -855,12 +869,16 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_ECB_encrypt_loop_block_256\n\t" #else - "BNE.N L_AES_ECB_encrypt_loop_block_256\n\t" + "BNE.W L_AES_ECB_encrypt_loop_block_256\n\t" #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_AES_ECB_encrypt_end\n\t" +#else + "B.N L_AES_ECB_encrypt_end\n\t" +#endif "\n" "L_AES_ECB_encrypt_start_block_192:\n\t" "\n" @@ -895,12 +913,16 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_ECB_encrypt_loop_block_192\n\t" #else - "BNE.N L_AES_ECB_encrypt_loop_block_192\n\t" + "BNE.W L_AES_ECB_encrypt_loop_block_192\n\t" #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_AES_ECB_encrypt_end\n\t" +#else + "B.N L_AES_ECB_encrypt_end\n\t" +#endif "\n" "L_AES_ECB_encrypt_start_block_128:\n\t" "\n" @@ -935,21 +957,22 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_ECB_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_ECB_encrypt_loop_block_128\n\t" + "BNE.W L_AES_ECB_encrypt_loop_block_128\n\t" #endif "\n" "L_AES_ECB_encrypt_end:\n\t" "POP {%[ks]}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) -#else - 
[L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11" ); } @@ -975,23 +998,31 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r9, r5\n\t" +#else + "LDR r9, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" "LDM r9, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r9}\n\t" "CMP r8, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_CBC_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_CBC_encrypt_start_block_128\n\t" + "BEQ.W L_AES_CBC_encrypt_start_block_128\n\t" #endif "CMP r8, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_CBC_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_CBC_encrypt_start_block_192\n\t" + "BEQ.W L_AES_CBC_encrypt_start_block_192\n\t" #endif "\n" "L_AES_CBC_encrypt_loop_block_256:\n\t" @@ -1029,12 +1060,16 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_CBC_encrypt_loop_block_256\n\t" #else - "BNE.N L_AES_CBC_encrypt_loop_block_256\n\t" + "BNE.W L_AES_CBC_encrypt_loop_block_256\n\t" #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_AES_CBC_encrypt_end\n\t" +#else + "B.N L_AES_CBC_encrypt_end\n\t" +#endif "\n" "L_AES_CBC_encrypt_start_block_192:\n\t" "\n" @@ -1073,12 +1108,16 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_CBC_encrypt_loop_block_192\n\t" #else - "BNE.N L_AES_CBC_encrypt_loop_block_192\n\t" + "BNE.W L_AES_CBC_encrypt_loop_block_192\n\t" #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_AES_CBC_encrypt_end\n\t" +#else + "B.N L_AES_CBC_encrypt_end\n\t" +#endif "\n" "L_AES_CBC_encrypt_start_block_128:\n\t" "\n" @@ -1117,22 +1156,23 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_CBC_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_CBC_encrypt_loop_block_128\n\t" + "BNE.W L_AES_CBC_encrypt_loop_block_128\n\t" #endif "\n" "L_AES_CBC_encrypt_end:\n\t" "POP {%[ks], r9}\n\t" "STM r9, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - 
[L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) -#else - [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -1158,8 +1198,16 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r5\n\t" +#else + "LDR r8, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" "LDM r8, {r4, r5, r6, r7}\n\t" @@ -1170,16 +1218,16 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "STM r8, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r8}\n\t" "CMP r12, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_CTR_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_CTR_encrypt_start_block_128\n\t" + "BEQ.W L_AES_CTR_encrypt_start_block_128\n\t" #endif "CMP r12, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_CTR_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_CTR_encrypt_start_block_192\n\t" + "BEQ.W L_AES_CTR_encrypt_start_block_192\n\t" #endif "\n" "L_AES_CTR_encrypt_loop_block_256:\n\t" @@ -1221,12 +1269,16 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_CTR_encrypt_loop_block_256\n\t" #else - "BNE.N L_AES_CTR_encrypt_loop_block_256\n\t" + "BNE.W L_AES_CTR_encrypt_loop_block_256\n\t" #endif +#ifdef __GNUC__ "B L_AES_CTR_encrypt_end\n\t" +#else + "B.W L_AES_CTR_encrypt_end\n\t" +#endif "\n" "L_AES_CTR_encrypt_start_block_192:\n\t" "\n" @@ -1269,12 +1321,16 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_CTR_encrypt_loop_block_192\n\t" #else - "BNE.N L_AES_CTR_encrypt_loop_block_192\n\t" + "BNE.W L_AES_CTR_encrypt_loop_block_192\n\t" #endif +#ifdef __GNUC__ "B L_AES_CTR_encrypt_end\n\t" +#else + "B.W L_AES_CTR_encrypt_end\n\t" +#endif "\n" "L_AES_CTR_encrypt_start_block_128:\n\t" "\n" @@ -1317,10 +1373,10 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_CTR_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_CTR_encrypt_loop_block_128\n\t" + "BNE.W L_AES_CTR_encrypt_loop_block_128\n\t" #endif "\n" "L_AES_CTR_encrypt_end:\n\t" @@ -1330,13 +1386,14 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, 
unsigned long "REV r6, r6\n\t" "REV r7, r7\n\t" "STM r8, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) -#else - [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -1461,10 +1518,10 @@ void AES_decrypt_block(const uint32_t* td, int nr, const uint8_t* td4) "EOR r6, r6, r10\n\t" "EOR r7, r7, r11\n\t" "SUBS %[nr], %[nr], #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_decrypt_block_nr\n\t" #else - "BNE.N L_AES_decrypt_block_nr\n\t" + "BNE.W L_AES_decrypt_block_nr\n\t" #endif "UBFX r8, r7, #16, #8\n\t" "LSR r11, r4, #24\n\t" @@ -1628,22 +1685,26 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_td_ecb]\n\t" "MOV r12, %[len]\n\t" "MOV r2, %[L_AES_Thumb2_td4]\n\t" "CMP r8, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_ECB_decrypt_start_block_128\n\t" #else - "BEQ.N L_AES_ECB_decrypt_start_block_128\n\t" + "BEQ.W L_AES_ECB_decrypt_start_block_128\n\t" #endif "CMP r8, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_ECB_decrypt_start_block_192\n\t" #else - "BEQ.N L_AES_ECB_decrypt_start_block_192\n\t" + "BEQ.W L_AES_ECB_decrypt_start_block_192\n\t" #endif "\n" "L_AES_ECB_decrypt_loop_block_256:\n\t" @@ -1676,12 +1737,16 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_ECB_decrypt_loop_block_256\n\t" #else - "BNE.N L_AES_ECB_decrypt_loop_block_256\n\t" + "BNE.W L_AES_ECB_decrypt_loop_block_256\n\t" #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_AES_ECB_decrypt_end\n\t" +#else + "B.N L_AES_ECB_decrypt_end\n\t" +#endif "\n" "L_AES_ECB_decrypt_start_block_192:\n\t" "\n" @@ -1715,12 +1780,16 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_ECB_decrypt_loop_block_192\n\t" #else - "BNE.N L_AES_ECB_decrypt_loop_block_192\n\t" + "BNE.W L_AES_ECB_decrypt_loop_block_192\n\t" #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_AES_ECB_decrypt_end\n\t" +#else + "B.N L_AES_ECB_decrypt_end\n\t" +#endif "\n" "L_AES_ECB_decrypt_start_block_128:\n\t" "\n" @@ -1754,20 +1823,21 @@ void AES_ECB_decrypt(const unsigned 
char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_ECB_decrypt_loop_block_128\n\t" #else - "BNE.N L_AES_ECB_decrypt_loop_block_128\n\t" + "BNE.W L_AES_ECB_decrypt_loop_block_128\n\t" #endif "\n" "L_AES_ECB_decrypt_end:\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) -#else - [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb), [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), + [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr) + : [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb), [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -1794,24 +1864,32 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r4, r5\n\t" +#else + "LDR r4, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_td_ecb]\n\t" "MOV r12, %[len]\n\t" "MOV r2, %[L_AES_Thumb2_td4]\n\t" "PUSH {%[ks], r4}\n\t" "CMP r8, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_CBC_decrypt_loop_block_128\n\t" #else - "BEQ.N L_AES_CBC_decrypt_loop_block_128\n\t" + "BEQ.W L_AES_CBC_decrypt_loop_block_128\n\t" #endif "CMP r8, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_CBC_decrypt_loop_block_192\n\t" #else - "BEQ.N L_AES_CBC_decrypt_loop_block_192\n\t" + "BEQ.W L_AES_CBC_decrypt_loop_block_192\n\t" #endif "\n" "L_AES_CBC_decrypt_loop_block_256:\n\t" @@ -1854,10 +1932,10 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_CBC_decrypt_end_odd\n\t" #else - "BEQ.N L_AES_CBC_decrypt_end_odd\n\t" + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" #endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" @@ -1899,12 +1977,16 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_CBC_decrypt_loop_block_256\n\t" #else - "BNE.N L_AES_CBC_decrypt_loop_block_256\n\t" + "BNE.W L_AES_CBC_decrypt_loop_block_256\n\t" #endif +#ifdef __GNUC__ "B L_AES_CBC_decrypt_end\n\t" +#else + "B.W L_AES_CBC_decrypt_end\n\t" +#endif "\n" "L_AES_CBC_decrypt_loop_block_192:\n\t" "PUSH {r1, r12, lr}\n\t" @@ -1946,10 +2028,10 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, 
#0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_CBC_decrypt_end_odd\n\t" #else - "BEQ.N L_AES_CBC_decrypt_end_odd\n\t" + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" #endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" @@ -1991,12 +2073,16 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_CBC_decrypt_loop_block_192\n\t" #else - "BNE.N L_AES_CBC_decrypt_loop_block_192\n\t" + "BNE.W L_AES_CBC_decrypt_loop_block_192\n\t" #endif +#ifdef __GNUC__ "B L_AES_CBC_decrypt_end\n\t" +#else + "B.W L_AES_CBC_decrypt_end\n\t" +#endif "\n" "L_AES_CBC_decrypt_loop_block_128:\n\t" "PUSH {r1, r12, lr}\n\t" @@ -2038,10 +2124,10 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_CBC_decrypt_end_odd\n\t" #else - "BEQ.N L_AES_CBC_decrypt_end_odd\n\t" + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" #endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" @@ -2083,12 +2169,16 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_CBC_decrypt_loop_block_128\n\t" #else - "BNE.N L_AES_CBC_decrypt_loop_block_128\n\t" + "BNE.W L_AES_CBC_decrypt_loop_block_128\n\t" #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_AES_CBC_decrypt_end\n\t" +#else + "B.N L_AES_CBC_decrypt_end\n\t" +#endif "\n" "L_AES_CBC_decrypt_end_odd:\n\t" "LDR r4, [sp, #4]\n\t" @@ -2099,13 +2189,14 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "\n" "L_AES_CBC_decrypt_end:\n\t" "POP {%[ks], r4}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) -#else - [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb), [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), + [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv) + : [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb), [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r8", "r9", "r10", "r11" ); } @@ -2685,18 +2776,19 @@ void GCM_gmult_len(unsigned char* x, const unsigned char** m, const unsigned cha "POP {r3}\n\t" "SUBS %[len], %[len], #0x10\n\t" "ADD %[data], %[data], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_GCM_gmult_len_start_block\n\t" #else - "BNE.N L_GCM_gmult_len_start_block\n\t" + "BNE.W L_GCM_gmult_len_start_block\n\t" #endif - : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] 
"+r" (len), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c) -#else - [L_GCM_gmult_len_r] "r" (L_GCM_gmult_len_r) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len), + [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c) : +#else + : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len) + : [L_GCM_gmult_len_r] "r" (L_GCM_gmult_len_r) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); } @@ -2721,8 +2813,16 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r5\n\t" +#else + "LDR r8, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_gcm]\n\t" "LDM r8, {r4, r5, r6, r7}\n\t" @@ -2733,16 +2833,16 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "STM r8, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r8}\n\t" "CMP r12, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_GCM_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_GCM_encrypt_start_block_128\n\t" + "BEQ.W L_AES_GCM_encrypt_start_block_128\n\t" #endif "CMP r12, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BEQ L_AES_GCM_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_GCM_encrypt_start_block_192\n\t" + "BEQ.W L_AES_GCM_encrypt_start_block_192\n\t" #endif "\n" "L_AES_GCM_encrypt_loop_block_256:\n\t" @@ -2781,12 +2881,16 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_GCM_encrypt_loop_block_256\n\t" #else - "BNE.N L_AES_GCM_encrypt_loop_block_256\n\t" + "BNE.W L_AES_GCM_encrypt_loop_block_256\n\t" #endif +#ifdef __GNUC__ "B L_AES_GCM_encrypt_end\n\t" +#else + "B.W L_AES_GCM_encrypt_end\n\t" +#endif "\n" "L_AES_GCM_encrypt_start_block_192:\n\t" "\n" @@ -2826,12 +2930,16 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_GCM_encrypt_loop_block_192\n\t" #else - "BNE.N L_AES_GCM_encrypt_loop_block_192\n\t" + "BNE.W L_AES_GCM_encrypt_loop_block_192\n\t" #endif +#ifdef __GNUC__ "B L_AES_GCM_encrypt_end\n\t" +#else + "B.W L_AES_GCM_encrypt_end\n\t" +#endif "\n" "L_AES_GCM_encrypt_start_block_128:\n\t" "\n" @@ -2871,10 +2979,10 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_AES_GCM_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_GCM_encrypt_loop_block_128\n\t" + "BNE.W L_AES_GCM_encrypt_loop_block_128\n\t" #endif "\n" "L_AES_GCM_encrypt_end:\n\t" @@ -2884,13 +2992,14 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "REV r6, 
r6\n\t" "REV r7, r7\n\t" "STM r8, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - [L_AES_Thumb2_te_gcm] "+r" (L_AES_Thumb2_te_gcm_c) -#else - [L_AES_Thumb2_te_gcm] "r" (L_AES_Thumb2_te_gcm) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_Thumb2_te_gcm] "+r" (L_AES_Thumb2_te_gcm_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [L_AES_Thumb2_te_gcm] "r" (L_AES_Thumb2_te_gcm) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S index 5179e60e2..30d8dc76b 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S @@ -925,10 +925,10 @@ L_SHA256_transform_len_start: STR r9, [sp, #60] ADD r3, r3, #0x40 SUBS r12, r12, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_SHA256_transform_len_start #else - BNE.N L_SHA256_transform_len_start + BNE.W L_SHA256_transform_len_start #endif /* Round 0 */ LDR r5, [r0, #16] @@ -1470,10 +1470,10 @@ L_SHA256_transform_len_start: SUBS r2, r2, #0x40 SUB r3, r3, #0xc0 ADD r1, r1, #0x40 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_SHA256_transform_len_begin #else - BNE.N L_SHA256_transform_len_begin + BNE.W L_SHA256_transform_len_begin #endif ADD sp, sp, #0xc0 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c index 9a39382df..a2367c2a2 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c @@ -904,10 +904,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "STR r9, [sp, #60]\n\t" "ADD r3, r3, #0x40\n\t" "SUBS r12, r12, #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_SHA256_transform_len_start\n\t" #else - "BNE.N L_SHA256_transform_len_start\n\t" + "BNE.W L_SHA256_transform_len_start\n\t" #endif /* Round 0 */ "LDR r5, [%[sha256], #16]\n\t" @@ -1449,19 +1449,20 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "SUBS %[len], %[len], #0x40\n\t" "SUB r3, r3, #0xc0\n\t" "ADD %[data], %[data], #0x40\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_SHA256_transform_len_begin\n\t" #else - "BNE.N L_SHA256_transform_len_begin\n\t" + "BNE.W L_SHA256_transform_len_begin\n\t" #endif "ADD sp, sp, #0xc0\n\t" - : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c) -#else - [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), + [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c) : +#else + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len) + : [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } diff --git 
a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S index bd6e66793..6031b9240 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S @@ -2319,10 +2319,10 @@ L_SHA512_transform_len_start: STRD r4, r5, [sp, #120] ADD r3, r3, #0x80 SUBS r12, r12, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_SHA512_transform_len_start #else - BNE.N L_SHA512_transform_len_start + BNE.W L_SHA512_transform_len_start #endif /* Round 0 */ LDRD r4, r5, [r0, #32] @@ -3656,10 +3656,10 @@ L_SHA512_transform_len_start: SUBS r2, r2, #0x80 SUB r3, r3, #0x200 ADD r1, r1, #0x80 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_SHA512_transform_len_begin #else - BNE.N L_SHA512_transform_len_begin + BNE.W L_SHA512_transform_len_begin #endif EOR r0, r0, r0 ADD sp, sp, #0xc0 diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c index 4b0407a0b..7521b35fa 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c @@ -2226,10 +2226,10 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "STRD r4, r5, [sp, #120]\n\t" "ADD r3, r3, #0x80\n\t" "SUBS r12, r12, #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_SHA512_transform_len_start\n\t" #else - "BNE.N L_SHA512_transform_len_start\n\t" + "BNE.W L_SHA512_transform_len_start\n\t" #endif /* Round 0 */ "LDRD r4, r5, [%[sha512], #32]\n\t" @@ -3563,20 +3563,21 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "SUBS %[len], %[len], #0x80\n\t" "SUB r3, r3, #0x200\n\t" "ADD %[data], %[data], #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ "BNE L_SHA512_transform_len_begin\n\t" #else - "BNE.N L_SHA512_transform_len_begin\n\t" + "BNE.W L_SHA512_transform_len_begin\n\t" #endif "EOR r0, r0, r0\n\t" "ADD sp, sp, #0xc0\n\t" - : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), #ifndef WOLFSSL_NO_VAR_ASSIGN_REG - [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c) -#else - [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k) -#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), + [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c) : +#else + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index 0459375b1..54423c2d5 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -5404,10 +5404,13 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x200\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_mul_64_outer_%=: \n\t" "subs r3, r5, #0xfc\n\t" @@ -5452,13 +5455,86 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, 
[%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x100\n\t" - "beq L_sp_2048_mul_64_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_mul_64_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_mul_64_inner_done_%=\n\t" + "blt L_sp_2048_mul_64_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_2048_mul_64_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -5466,14 +5542,46 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x1f8\n\t" + "cmp r5, #0x1f4\n\t" "ble L_sp_2048_mul_64_outer_%=\n\t" + "ldr lr, [%[a], #252]\n\t" + "ldr r11, [%[b], #252]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_mul_64_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_mul_64_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -5493,10 
+5601,12 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x200\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_sqr_64_outer_%=: \n\t" "subs r3, r5, #0xfc\n\t" @@ -5505,8 +5615,6 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_2048_sqr_64_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_2048_sqr_64_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -5558,9 +5666,11 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_2048_sqr_64_op_done_%=\n\t" - "\n" - "L_sp_2048_sqr_64_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_sqr_64_inner_done_%=\n\t" + "blt L_sp_2048_sqr_64_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -5589,30 +5699,46 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_2048_sqr_64_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x100\n\t" - "beq L_sp_2048_sqr_64_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_2048_sqr_64_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_sqr_64_inner_%=\n\t" - "\n" "L_sp_2048_sqr_64_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x1f8\n\t" + "cmp r5, #0x1f4\n\t" "ble L_sp_2048_sqr_64_outer_%=\n\t" + "ldr lr, [%[a], #252]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_sqr_64_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_sqr_64_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -5729,10 +5855,13 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_mul_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -5777,13 +5906,86 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + 
"adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_2048_mul_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_mul_32_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_mul_32_inner_done_%=\n\t" + "blt L_sp_2048_mul_32_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_2048_mul_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -5791,14 +5993,46 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_2048_mul_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" + "ldr r11, [%[b], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_mul_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_mul_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -5818,10 +6052,12 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, 
[sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_sqr_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -5830,8 +6066,6 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_2048_sqr_32_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_2048_sqr_32_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -5883,9 +6117,11 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_2048_sqr_32_op_done_%=\n\t" - "\n" - "L_sp_2048_sqr_32_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_sqr_32_inner_done_%=\n\t" + "blt L_sp_2048_sqr_32_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -5914,30 +6150,46 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_2048_sqr_32_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_2048_sqr_32_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_2048_sqr_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_sqr_32_inner_%=\n\t" - "\n" "L_sp_2048_sqr_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_2048_sqr_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_sqr_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_sqr_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -28089,10 +28341,13 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x300\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_mul_96_outer_%=: \n\t" "subs r3, r5, #0x17c\n\t" @@ -28137,13 +28392,86 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs 
r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x180\n\t" - "beq L_sp_3072_mul_96_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_mul_96_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_mul_96_inner_done_%=\n\t" + "blt L_sp_3072_mul_96_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_3072_mul_96_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -28151,14 +28479,46 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x2f8\n\t" + "cmp r5, #0x2f4\n\t" "ble L_sp_3072_mul_96_outer_%=\n\t" + "ldr lr, [%[a], #380]\n\t" + "ldr r11, [%[b], #380]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_mul_96_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_mul_96_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -28178,10 +28538,12 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x300\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_sqr_96_outer_%=: \n\t" "subs r3, r5, #0x17c\n\t" @@ -28190,8 +28552,6 @@ static void 
sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_3072_sqr_96_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_3072_sqr_96_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -28243,9 +28603,11 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_3072_sqr_96_op_done_%=\n\t" - "\n" - "L_sp_3072_sqr_96_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_sqr_96_inner_done_%=\n\t" + "blt L_sp_3072_sqr_96_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -28274,30 +28636,46 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_3072_sqr_96_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x180\n\t" - "beq L_sp_3072_sqr_96_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_3072_sqr_96_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_sqr_96_inner_%=\n\t" - "\n" "L_sp_3072_sqr_96_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x2f8\n\t" + "cmp r5, #0x2f4\n\t" "ble L_sp_3072_sqr_96_outer_%=\n\t" + "ldr lr, [%[a], #380]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_sqr_96_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_sqr_96_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -28414,10 +28792,13 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x180\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_mul_48_outer_%=: \n\t" "subs r3, r5, #0xbc\n\t" @@ -28462,13 +28843,86 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, 
r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0xc0\n\t" - "beq L_sp_3072_mul_48_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_mul_48_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_mul_48_inner_done_%=\n\t" + "blt L_sp_3072_mul_48_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_3072_mul_48_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -28476,14 +28930,46 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x178\n\t" + "cmp r5, #0x174\n\t" "ble L_sp_3072_mul_48_outer_%=\n\t" + "ldr lr, [%[a], #188]\n\t" + "ldr r11, [%[b], #188]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_mul_48_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_mul_48_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -28503,10 +28989,12 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x180\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_sqr_48_outer_%=: \n\t" "subs r3, r5, #0xbc\n\t" @@ -28515,8 +29003,6 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_3072_sqr_48_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_3072_sqr_48_op_sqr_%=\n\t" "ldr lr, [%[a], 
r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -28568,9 +29054,11 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_3072_sqr_48_op_done_%=\n\t" - "\n" - "L_sp_3072_sqr_48_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_sqr_48_inner_done_%=\n\t" + "blt L_sp_3072_sqr_48_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -28599,30 +29087,46 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_3072_sqr_48_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0xc0\n\t" - "beq L_sp_3072_sqr_48_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_3072_sqr_48_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_sqr_48_inner_%=\n\t" - "\n" "L_sp_3072_sqr_48_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x178\n\t" + "cmp r5, #0x174\n\t" "ble L_sp_3072_sqr_48_outer_%=\n\t" + "ldr lr, [%[a], #188]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_sqr_48_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_sqr_48_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -46059,10 +46563,13 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* __asm__ __volatile__ ( "sub sp, sp, #0x400\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_4096_mul_128_outer_%=: \n\t" "subs r3, r5, #0x1fc\n\t" @@ -46107,13 +46614,86 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" 
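+        /* End of the pre-ARMv4 path: the mirrored partial product has been
+         * folded into the r6:r7:r8 column accumulator using four 16x16 MULs,
+         * matching the single UMULL in the branch below. */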
+#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x200\n\t" - "beq L_sp_4096_mul_128_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_4096_mul_128_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_4096_mul_128_inner_done_%=\n\t" + "blt L_sp_4096_mul_128_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_4096_mul_128_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -46121,14 +46701,46 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x3f8\n\t" + "cmp r5, #0x3f4\n\t" "ble L_sp_4096_mul_128_outer_%=\n\t" + "ldr lr, [%[a], #508]\n\t" + "ldr r11, [%[b], #508]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_4096_mul_128_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_4096_mul_128_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -46148,10 +46760,12 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x400\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_4096_sqr_128_outer_%=: \n\t" "subs r3, r5, #0x1fc\n\t" @@ -46160,8 +46774,6 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_4096_sqr_128_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_4096_sqr_128_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -46213,9 +46825,11 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* 
a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_4096_sqr_128_op_done_%=\n\t" - "\n" - "L_sp_4096_sqr_128_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_4096_sqr_128_inner_done_%=\n\t" + "blt L_sp_4096_sqr_128_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -46244,30 +46858,46 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_4096_sqr_128_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x200\n\t" - "beq L_sp_4096_sqr_128_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_4096_sqr_128_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_4096_sqr_128_inner_%=\n\t" - "\n" "L_sp_4096_sqr_128_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x3f8\n\t" + "cmp r5, #0x3f4\n\t" "ble L_sp_4096_sqr_128_outer_%=\n\t" + "ldr lr, [%[a], #508]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_4096_sqr_128_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_4096_sqr_128_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -60832,10 +61462,13 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_256_mul_8_outer_%=: \n\t" "subs r3, r5, #28\n\t" @@ -60880,13 +61513,86 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp 
r3, #32\n\t" - "beq L_sp_256_mul_8_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_256_mul_8_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_256_mul_8_inner_done_%=\n\t" + "blt L_sp_256_mul_8_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_256_mul_8_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -60894,14 +61600,46 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #56\n\t" + "cmp r5, #52\n\t" "ble L_sp_256_mul_8_outer_%=\n\t" + "ldr lr, [%[a], #28]\n\t" + "ldr r11, [%[b], #28]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_256_mul_8_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_256_mul_8_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -63404,10 +64142,12 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_256_sqr_8_outer_%=: \n\t" "subs r3, r5, #28\n\t" @@ -63416,8 +64156,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_256_sqr_8_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_256_sqr_8_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -63469,9 +64207,11 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_256_sqr_8_op_done_%=\n\t" - "\n" - "L_sp_256_sqr_8_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt 
L_sp_256_sqr_8_inner_done_%=\n\t" + "blt L_sp_256_sqr_8_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -63500,30 +64240,46 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_256_sqr_8_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #32\n\t" - "beq L_sp_256_sqr_8_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_256_sqr_8_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_256_sqr_8_inner_%=\n\t" - "\n" "L_sp_256_sqr_8_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #56\n\t" + "cmp r5, #52\n\t" "ble L_sp_256_sqr_8_outer_%=\n\t" + "ldr lr, [%[a], #28]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_256_sqr_8_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_256_sqr_8_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -79029,10 +79785,13 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ __asm__ __volatile__ ( "sub sp, sp, #0x60\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_384_mul_12_outer_%=: \n\t" "subs r3, r5, #44\n\t" @@ -79077,13 +79836,86 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #48\n\t" - "beq L_sp_384_mul_12_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_384_mul_12_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_384_mul_12_inner_done_%=\n\t" + "blt L_sp_384_mul_12_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], 
r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_384_mul_12_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -79091,14 +79923,46 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x58\n\t" + "cmp r5, #0x54\n\t" "ble L_sp_384_mul_12_outer_%=\n\t" + "ldr lr, [%[a], #44]\n\t" + "ldr r11, [%[b], #44]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_384_mul_12_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_384_mul_12_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -84617,10 +85481,12 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x60\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_384_sqr_12_outer_%=: \n\t" "subs r3, r5, #44\n\t" @@ -84629,8 +85495,6 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_384_sqr_12_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_384_sqr_12_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -84682,9 +85546,11 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_384_sqr_12_op_done_%=\n\t" - "\n" - "L_sp_384_sqr_12_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_384_sqr_12_inner_done_%=\n\t" + "blt L_sp_384_sqr_12_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -84713,30 +85579,46 @@ static void sp_384_sqr_12(sp_digit* r_p, const 
sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_384_sqr_12_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #48\n\t" - "beq L_sp_384_sqr_12_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_384_sqr_12_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_384_sqr_12_inner_%=\n\t" - "\n" "L_sp_384_sqr_12_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x58\n\t" + "cmp r5, #0x54\n\t" "ble L_sp_384_sqr_12_outer_%=\n\t" + "ldr lr, [%[a], #44]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_384_sqr_12_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_384_sqr_12_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -97021,10 +97903,13 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ __asm__ __volatile__ ( "sub sp, sp, #0x88\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_521_mul_17_outer_%=: \n\t" "subs r3, r5, #0x40\n\t" @@ -97069,13 +97954,86 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x44\n\t" - "beq L_sp_521_mul_17_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_521_mul_17_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_521_mul_17_inner_done_%=\n\t" + "blt L_sp_521_mul_17_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, 
r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_521_mul_17_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -97083,17 +98041,49 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x80\n\t" + "cmp r5, #0x7c\n\t" "ble L_sp_521_mul_17_outer_%=\n\t" + "ldr lr, [%[a], #64]\n\t" + "ldr r11, [%[b], #64]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "ldm sp!, {r6, r7}\n\t" "stm %[r]!, {r6, r7}\n\t" "sub r5, r5, #8\n\t" "\n" "L_sp_521_mul_17_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_521_mul_17_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -108131,10 +109121,12 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x88\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_521_sqr_17_outer_%=: \n\t" "subs r3, r5, #0x40\n\t" @@ -108143,8 +109135,6 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_521_sqr_17_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_521_sqr_17_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -108196,9 +109186,11 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_521_sqr_17_op_done_%=\n\t" - "\n" - "L_sp_521_sqr_17_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_521_sqr_17_inner_done_%=\n\t" + "blt L_sp_521_sqr_17_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -108227,33 +109219,49 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_521_sqr_17_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, 
#0x44\n\t" - "beq L_sp_521_sqr_17_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_521_sqr_17_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_521_sqr_17_inner_%=\n\t" - "\n" "L_sp_521_sqr_17_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x80\n\t" + "cmp r5, #0x7c\n\t" "ble L_sp_521_sqr_17_outer_%=\n\t" + "ldr lr, [%[a], #64]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "ldm sp!, {r6, r7}\n\t" "stm %[r]!, {r6, r7}\n\t" "sub r5, r5, #8\n\t" "\n" "L_sp_521_sqr_17_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_521_sqr_17_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -141064,10 +142072,13 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_1024_mul_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -141112,13 +142123,86 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_1024_mul_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_1024_mul_32_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_1024_mul_32_inner_done_%=\n\t" + "blt L_sp_1024_mul_32_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" 
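+        /* Next: high16(b) * low16(a), split across r9:r10 and added to the
+         * column accumulator at bit 16. */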
+ "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_1024_mul_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -141126,14 +142210,46 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_1024_mul_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" + "ldr r11, [%[b], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_1024_mul_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_1024_mul_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -141153,10 +142269,12 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_1024_sqr_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -141165,8 +142283,6 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_1024_sqr_32_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_1024_sqr_32_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -141218,9 +142334,11 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_1024_sqr_32_op_done_%=\n\t" - "\n" - "L_sp_1024_sqr_32_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_1024_sqr_32_inner_done_%=\n\t" + "blt L_sp_1024_sqr_32_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -141249,30 +142367,46 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_1024_sqr_32_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_1024_sqr_32_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_1024_sqr_32_inner_done_%=\n\t" 
- "cmp r3, r5\n\t" - "ble L_sp_1024_sqr_32_inner_%=\n\t" - "\n" "L_sp_1024_sqr_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_1024_sqr_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_1024_sqr_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_1024_sqr_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index 30a4d7260..af612beba 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -2316,15 +2316,18 @@ static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x200\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_2048_mul_64_outer:\n\t" "SUBS r3, r5, #0xfc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_2048_mul_64_inner:\n\t" @@ -2334,20 +2337,31 @@ static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x100\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_mul_64_inner_done\n\t" + "BGT L_sp_2048_mul_64_inner_done\n\t" #else - "BEQ.N L_sp_2048_mul_64_inner_done\n\t" + "BGT.N L_sp_2048_mul_64_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_mul_64_inner\n\t" + "BLT L_sp_2048_mul_64_inner\n\t" #else - "BLE.N L_sp_2048_mul_64_inner\n\t" + "BLT.N L_sp_2048_mul_64_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_2048_mul_64_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -2355,18 +2369,23 @@ static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x1f8\n\t" + "CMP r5, #0x1f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_2048_mul_64_outer\n\t" #else "BLE.N L_sp_2048_mul_64_outer\n\t" #endif + "LDR lr, [%[a], #252]\n\t" + "LDR r11, [%[b], #252]\n\t" + "UMLAL r6, r7, lr, 
r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_2048_mul_64_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_2048_mul_64_store\n\t" #else @@ -2396,24 +2415,20 @@ static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x200\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_2048_sqr_64_outer:\n\t" "SUBS r3, r5, #0xfc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_2048_sqr_64_inner:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_64_op_sqr\n\t" -#else - "BEQ.N L_sp_2048_sqr_64_op_sqr\n\t" -#endif "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -2423,36 +2438,24 @@ static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_2048_sqr_64_op_done\n\t" - "\n" - "L_sp_2048_sqr_64_op_sqr:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_2048_sqr_64_op_done:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x100\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_64_inner_done\n\t" -#else - "BEQ.N L_sp_2048_sqr_64_inner_done\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_2048_sqr_64_inner_done\n\t" #else "BGT.N L_sp_2048_sqr_64_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_sqr_64_inner\n\t" + "BLT L_sp_2048_sqr_64_inner\n\t" #else - "BLE.N L_sp_2048_sqr_64_inner\n\t" + "BLT.N L_sp_2048_sqr_64_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_2048_sqr_64_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -2460,18 +2463,22 @@ static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x1f8\n\t" + "CMP r5, #0x1f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_2048_sqr_64_outer\n\t" #else "BLE.N L_sp_2048_sqr_64_outer\n\t" #endif + "LDR lr, [%[a], #252]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_2048_sqr_64_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_2048_sqr_64_store\n\t" #else @@ -2618,15 +2625,18 @@ static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" 
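+        /* Load a[0]; together with b[0] below, the UMULL seeds column 0 of
+         * the result so the outer loop can start at byte offset 4. */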
+ "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_2048_mul_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_2048_mul_32_inner:\n\t" @@ -2636,20 +2646,31 @@ static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_mul_32_inner_done\n\t" + "BGT L_sp_2048_mul_32_inner_done\n\t" #else - "BEQ.N L_sp_2048_mul_32_inner_done\n\t" + "BGT.N L_sp_2048_mul_32_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_mul_32_inner\n\t" + "BLT L_sp_2048_mul_32_inner\n\t" #else - "BLE.N L_sp_2048_mul_32_inner\n\t" + "BLT.N L_sp_2048_mul_32_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_2048_mul_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -2657,18 +2678,23 @@ static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_2048_mul_32_outer\n\t" #else "BLE.N L_sp_2048_mul_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "LDR r11, [%[b], #124]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_2048_mul_32_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_2048_mul_32_store\n\t" #else @@ -2698,24 +2724,20 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_2048_sqr_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_2048_sqr_32_inner:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_32_op_sqr\n\t" -#else - "BEQ.N L_sp_2048_sqr_32_op_sqr\n\t" -#endif "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -2725,36 +2747,24 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_2048_sqr_32_op_done\n\t" - "\n" - "L_sp_2048_sqr_32_op_sqr:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_2048_sqr_32_op_done:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP 
r3, #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_32_inner_done\n\t" -#else - "BEQ.N L_sp_2048_sqr_32_inner_done\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_2048_sqr_32_inner_done\n\t" #else "BGT.N L_sp_2048_sqr_32_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_sqr_32_inner\n\t" + "BLT L_sp_2048_sqr_32_inner\n\t" #else - "BLE.N L_sp_2048_sqr_32_inner\n\t" + "BLT.N L_sp_2048_sqr_32_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_2048_sqr_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -2762,18 +2772,22 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_2048_sqr_32_outer\n\t" #else "BLE.N L_sp_2048_sqr_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_2048_sqr_32_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_2048_sqr_32_store\n\t" #else @@ -13039,15 +13053,18 @@ static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x300\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_3072_mul_96_outer:\n\t" "SUBS r3, r5, #0x17c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_3072_mul_96_inner:\n\t" @@ -13057,20 +13074,31 @@ static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x180\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_mul_96_inner_done\n\t" + "BGT L_sp_3072_mul_96_inner_done\n\t" #else - "BEQ.N L_sp_3072_mul_96_inner_done\n\t" + "BGT.N L_sp_3072_mul_96_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_mul_96_inner\n\t" + "BLT L_sp_3072_mul_96_inner\n\t" #else - "BLE.N L_sp_3072_mul_96_inner\n\t" + "BLT.N L_sp_3072_mul_96_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_3072_mul_96_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -13078,18 +13106,23 @@ static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x2f8\n\t" + "CMP r5, 
#0x2f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_3072_mul_96_outer\n\t" #else "BLE.N L_sp_3072_mul_96_outer\n\t" #endif + "LDR lr, [%[a], #380]\n\t" + "LDR r11, [%[b], #380]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_3072_mul_96_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_3072_mul_96_store\n\t" #else @@ -13119,24 +13152,20 @@ static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x300\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_3072_sqr_96_outer:\n\t" "SUBS r3, r5, #0x17c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_3072_sqr_96_inner:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_96_op_sqr\n\t" -#else - "BEQ.N L_sp_3072_sqr_96_op_sqr\n\t" -#endif "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -13146,36 +13175,24 @@ static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_3072_sqr_96_op_done\n\t" - "\n" - "L_sp_3072_sqr_96_op_sqr:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_3072_sqr_96_op_done:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x180\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_96_inner_done\n\t" -#else - "BEQ.N L_sp_3072_sqr_96_inner_done\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_3072_sqr_96_inner_done\n\t" #else "BGT.N L_sp_3072_sqr_96_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_sqr_96_inner\n\t" + "BLT L_sp_3072_sqr_96_inner\n\t" #else - "BLE.N L_sp_3072_sqr_96_inner\n\t" + "BLT.N L_sp_3072_sqr_96_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_3072_sqr_96_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -13183,18 +13200,22 @@ static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x2f8\n\t" + "CMP r5, #0x2f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_3072_sqr_96_outer\n\t" #else "BLE.N L_sp_3072_sqr_96_outer\n\t" #endif + "LDR lr, [%[a], #380]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_3072_sqr_96_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_3072_sqr_96_store\n\t" #else @@ -13341,15 +13362,18 @@ static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x180\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_3072_mul_48_outer:\n\t" "SUBS r3, r5, #0xbc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_3072_mul_48_inner:\n\t" @@ -13359,20 +13383,31 @@ static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0xc0\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_mul_48_inner_done\n\t" + "BGT L_sp_3072_mul_48_inner_done\n\t" #else - "BEQ.N L_sp_3072_mul_48_inner_done\n\t" + "BGT.N L_sp_3072_mul_48_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_mul_48_inner\n\t" + "BLT L_sp_3072_mul_48_inner\n\t" #else - "BLE.N L_sp_3072_mul_48_inner\n\t" + "BLT.N L_sp_3072_mul_48_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_3072_mul_48_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -13380,18 +13415,23 @@ static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x178\n\t" + "CMP r5, #0x174\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_3072_mul_48_outer\n\t" #else "BLE.N L_sp_3072_mul_48_outer\n\t" #endif + "LDR lr, [%[a], #188]\n\t" + "LDR r11, [%[b], #188]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_3072_mul_48_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_3072_mul_48_store\n\t" #else @@ -13421,24 +13461,20 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x180\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_3072_sqr_48_outer:\n\t" "SUBS r3, r5, #0xbc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_3072_sqr_48_inner:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_48_op_sqr\n\t" -#else - "BEQ.N L_sp_3072_sqr_48_op_sqr\n\t" -#endif "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -13448,36 +13484,24 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal 
L_sp_3072_sqr_48_op_done\n\t" - "\n" - "L_sp_3072_sqr_48_op_sqr:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_3072_sqr_48_op_done:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0xc0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_48_inner_done\n\t" -#else - "BEQ.N L_sp_3072_sqr_48_inner_done\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_3072_sqr_48_inner_done\n\t" #else "BGT.N L_sp_3072_sqr_48_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_sqr_48_inner\n\t" + "BLT L_sp_3072_sqr_48_inner\n\t" #else - "BLE.N L_sp_3072_sqr_48_inner\n\t" + "BLT.N L_sp_3072_sqr_48_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_3072_sqr_48_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -13485,18 +13509,22 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x178\n\t" + "CMP r5, #0x174\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_3072_sqr_48_outer\n\t" #else "BLE.N L_sp_3072_sqr_48_outer\n\t" #endif + "LDR lr, [%[a], #188]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_3072_sqr_48_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_3072_sqr_48_store\n\t" #else @@ -23136,15 +23164,18 @@ static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x400\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_4096_mul_128_outer:\n\t" "SUBS r3, r5, #0x1fc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_4096_mul_128_inner:\n\t" @@ -23154,20 +23185,31 @@ static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x200\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_4096_mul_128_inner_done\n\t" + "BGT L_sp_4096_mul_128_inner_done\n\t" #else - "BEQ.N L_sp_4096_mul_128_inner_done\n\t" + "BGT.N L_sp_4096_mul_128_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_4096_mul_128_inner\n\t" + "BLT L_sp_4096_mul_128_inner\n\t" #else - "BLE.N L_sp_4096_mul_128_inner\n\t" + "BLT.N L_sp_4096_mul_128_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + 
"ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_4096_mul_128_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -23175,18 +23217,23 @@ static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x3f8\n\t" + "CMP r5, #0x3f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_4096_mul_128_outer\n\t" #else "BLE.N L_sp_4096_mul_128_outer\n\t" #endif + "LDR lr, [%[a], #508]\n\t" + "LDR r11, [%[b], #508]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_4096_mul_128_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_4096_mul_128_store\n\t" #else @@ -23216,24 +23263,20 @@ static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x400\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_4096_sqr_128_outer:\n\t" "SUBS r3, r5, #0x1fc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_4096_sqr_128_inner:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_4096_sqr_128_op_sqr\n\t" -#else - "BEQ.N L_sp_4096_sqr_128_op_sqr\n\t" -#endif "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -23243,36 +23286,24 @@ static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_4096_sqr_128_op_done\n\t" - "\n" - "L_sp_4096_sqr_128_op_sqr:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_4096_sqr_128_op_done:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x200\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_4096_sqr_128_inner_done\n\t" -#else - "BEQ.N L_sp_4096_sqr_128_inner_done\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_4096_sqr_128_inner_done\n\t" #else "BGT.N L_sp_4096_sqr_128_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_4096_sqr_128_inner\n\t" + "BLT L_sp_4096_sqr_128_inner\n\t" #else - "BLE.N L_sp_4096_sqr_128_inner\n\t" + "BLT.N L_sp_4096_sqr_128_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_4096_sqr_128_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -23280,18 +23311,22 @@ static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x3f8\n\t" + "CMP r5, #0x3f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_4096_sqr_128_outer\n\t" #else "BLE.N L_sp_4096_sqr_128_outer\n\t" #endif + "LDR lr, [%[a], #508]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" 
"\n" "L_sp_4096_sqr_128_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_4096_sqr_128_store\n\t" #else @@ -30826,15 +30861,18 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x40\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_256_mul_8_outer:\n\t" "SUBS r3, r5, #0x1c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_256_mul_8_inner:\n\t" @@ -30844,20 +30882,31 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x20\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_mul_8_inner_done\n\t" + "BGT L_sp_256_mul_8_inner_done\n\t" #else - "BEQ.N L_sp_256_mul_8_inner_done\n\t" + "BGT.N L_sp_256_mul_8_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_256_mul_8_inner\n\t" + "BLT L_sp_256_mul_8_inner\n\t" #else - "BLE.N L_sp_256_mul_8_inner\n\t" + "BLT.N L_sp_256_mul_8_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_256_mul_8_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -30865,18 +30914,23 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x38\n\t" + "CMP r5, #0x34\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_256_mul_8_outer\n\t" #else "BLE.N L_sp_256_mul_8_outer\n\t" #endif + "LDR lr, [%[a], #28]\n\t" + "LDR r11, [%[b], #28]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_256_mul_8_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_256_mul_8_store\n\t" #else @@ -31412,24 +31466,20 @@ static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x40\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_256_sqr_8_outer:\n\t" "SUBS r3, r5, #0x1c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_256_sqr_8_inner:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_sqr_8_op_sqr\n\t" -#else - "BEQ.N 
L_sp_256_sqr_8_op_sqr\n\t" -#endif "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -31439,36 +31489,24 @@ static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_256_sqr_8_op_done\n\t" - "\n" - "L_sp_256_sqr_8_op_sqr:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_256_sqr_8_op_done:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x20\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_sqr_8_inner_done\n\t" -#else - "BEQ.N L_sp_256_sqr_8_inner_done\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_256_sqr_8_inner_done\n\t" #else "BGT.N L_sp_256_sqr_8_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_256_sqr_8_inner\n\t" + "BLT L_sp_256_sqr_8_inner\n\t" #else - "BLE.N L_sp_256_sqr_8_inner\n\t" + "BLT.N L_sp_256_sqr_8_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_256_sqr_8_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -31476,18 +31514,22 @@ static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x38\n\t" + "CMP r5, #0x34\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_256_sqr_8_outer\n\t" #else "BLE.N L_sp_256_sqr_8_outer\n\t" #endif + "LDR lr, [%[a], #28]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_256_sqr_8_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_256_sqr_8_store\n\t" #else @@ -40204,7 +40246,11 @@ static void sp_256_div2_mod_8(sp_digit* r, const sp_digit* a, const sp_digit* m) "ADCS r6, r6, r10\n\t" "ADCS r7, r7, r11\n\t" "ADC r3, r12, r12\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_256_div2_mod_8_div2\n\t" +#else + "B.N L_sp_256_div2_mod_8_div2\n\t" +#endif "\n" "L_sp_256_div2_mod_8_even:\n\t" "LDRD r4, r5, [%[a], #12]\n\t" @@ -40260,7 +40306,11 @@ static int sp_256_num_bits_8(const sp_digit* a) "MOV r2, #0x100\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" "L_sp_256_num_bits_8_7:\n\t" "LDR r1, [%[a], #24]\n\t" @@ -40273,7 +40323,11 @@ static int sp_256_num_bits_8(const sp_digit* a) "MOV r2, #0xe0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" "L_sp_256_num_bits_8_6:\n\t" "LDR r1, [%[a], #20]\n\t" @@ -40286,7 +40340,11 @@ static int sp_256_num_bits_8(const sp_digit* a) "MOV r2, #0xc0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_256_num_bits_8_9\n\t" 
+#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" "L_sp_256_num_bits_8_5:\n\t" "LDR r1, [%[a], #16]\n\t" @@ -40299,7 +40357,11 @@ static int sp_256_num_bits_8(const sp_digit* a) "MOV r2, #0xa0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" "L_sp_256_num_bits_8_4:\n\t" "LDR r1, [%[a], #12]\n\t" @@ -40312,7 +40374,11 @@ static int sp_256_num_bits_8(const sp_digit* a) "MOV r2, #0x80\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" "L_sp_256_num_bits_8_3:\n\t" "LDR r1, [%[a], #8]\n\t" @@ -40325,7 +40391,11 @@ static int sp_256_num_bits_8(const sp_digit* a) "MOV r2, #0x60\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" "L_sp_256_num_bits_8_2:\n\t" "LDR r1, [%[a], #4]\n\t" @@ -40338,7 +40408,11 @@ static int sp_256_num_bits_8(const sp_digit* a) "MOV r2, #0x40\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" "L_sp_256_num_bits_8_1:\n\t" "LDR r1, [%[a]]\n\t" @@ -41461,15 +41535,18 @@ static void sp_384_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x60\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_384_mul_12_outer:\n\t" "SUBS r3, r5, #0x2c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_384_mul_12_inner:\n\t" @@ -41479,20 +41556,31 @@ static void sp_384_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x30\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_mul_12_inner_done\n\t" + "BGT L_sp_384_mul_12_inner_done\n\t" #else - "BEQ.N L_sp_384_mul_12_inner_done\n\t" + "BGT.N L_sp_384_mul_12_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_384_mul_12_inner\n\t" + "BLT L_sp_384_mul_12_inner\n\t" #else - "BLE.N L_sp_384_mul_12_inner\n\t" + "BLT.N L_sp_384_mul_12_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_384_mul_12_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -41500,18 +41588,23 @@ static void sp_384_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x58\n\t" + "CMP r5, #0x54\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_384_mul_12_outer\n\t" #else "BLE.N L_sp_384_mul_12_outer\n\t" #endif + "LDR lr, [%[a], #44]\n\t" + "LDR r11, 
[%[b], #44]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_384_mul_12_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_384_mul_12_store\n\t" #else @@ -42571,24 +42664,20 @@ static void sp_384_sqr_12(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x60\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_384_sqr_12_outer:\n\t" "SUBS r3, r5, #0x2c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_384_sqr_12_inner:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_sqr_12_op_sqr\n\t" -#else - "BEQ.N L_sp_384_sqr_12_op_sqr\n\t" -#endif "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -42598,36 +42687,24 @@ static void sp_384_sqr_12(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_384_sqr_12_op_done\n\t" - "\n" - "L_sp_384_sqr_12_op_sqr:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_384_sqr_12_op_done:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x30\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_sqr_12_inner_done\n\t" -#else - "BEQ.N L_sp_384_sqr_12_inner_done\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_384_sqr_12_inner_done\n\t" #else "BGT.N L_sp_384_sqr_12_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_384_sqr_12_inner\n\t" + "BLT L_sp_384_sqr_12_inner\n\t" #else - "BLE.N L_sp_384_sqr_12_inner\n\t" + "BLT.N L_sp_384_sqr_12_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_384_sqr_12_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -42635,18 +42712,22 @@ static void sp_384_sqr_12(sp_digit* r, const sp_digit* a) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x58\n\t" + "CMP r5, #0x54\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_384_sqr_12_outer\n\t" #else "BLE.N L_sp_384_sqr_12_outer\n\t" #endif + "LDR lr, [%[a], #44]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_384_sqr_12_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_384_sqr_12_store\n\t" #else @@ -49928,7 +50009,11 @@ static void sp_384_div2_mod_12(sp_digit* r, const sp_digit* a, const sp_digit* m "ADCS r7, r7, r11\n\t" "STM %[r]!, {r4, r5, r6, r7}\n\t" "ADC r3, r12, r12\n\t" +#if 
defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_div2_mod_12_div2\n\t" +#else + "B.N L_sp_384_div2_mod_12_div2\n\t" +#endif "\n" "L_sp_384_div2_mod_12_even:\n\t" "LDM %[a]!, {r5, r6, r7}\n\t" @@ -50014,7 +50099,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0x180\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_11:\n\t" "LDR r1, [%[a], #40]\n\t" @@ -50027,7 +50116,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0x160\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_10:\n\t" "LDR r1, [%[a], #36]\n\t" @@ -50040,7 +50133,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0x140\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_9:\n\t" "LDR r1, [%[a], #32]\n\t" @@ -50053,7 +50150,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0x120\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_8:\n\t" "LDR r1, [%[a], #28]\n\t" @@ -50066,7 +50167,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0x100\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_7:\n\t" "LDR r1, [%[a], #24]\n\t" @@ -50079,7 +50184,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0xe0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_6:\n\t" "LDR r1, [%[a], #20]\n\t" @@ -50092,7 +50201,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0xc0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_5:\n\t" "LDR r1, [%[a], #16]\n\t" @@ -50105,7 +50218,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0xa0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_4:\n\t" "LDR r1, [%[a], #12]\n\t" @@ -50118,7 +50235,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0x80\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_3:\n\t" "LDR r1, [%[a], #8]\n\t" @@ -50131,7 +50252,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0x60\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" 
+#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_2:\n\t" "LDR r1, [%[a], #4]\n\t" @@ -50144,7 +50269,11 @@ static int sp_384_num_bits_12(const sp_digit* a) "MOV r2, #0x40\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" "L_sp_384_num_bits_12_1:\n\t" "LDR r1, [%[a]]\n\t" @@ -51313,15 +51442,18 @@ static void sp_521_mul_17(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x88\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_521_mul_17_outer:\n\t" "SUBS r3, r5, #0x40\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_521_mul_17_inner:\n\t" @@ -51331,20 +51463,31 @@ static void sp_521_mul_17(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x44\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_mul_17_inner_done\n\t" + "BGT L_sp_521_mul_17_inner_done\n\t" #else - "BEQ.N L_sp_521_mul_17_inner_done\n\t" + "BGT.N L_sp_521_mul_17_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_521_mul_17_inner\n\t" + "BLT L_sp_521_mul_17_inner\n\t" #else - "BLE.N L_sp_521_mul_17_inner\n\t" + "BLT.N L_sp_521_mul_17_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_521_mul_17_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -51352,21 +51495,26 @@ static void sp_521_mul_17(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x80\n\t" + "CMP r5, #0x7c\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_521_mul_17_outer\n\t" #else "BLE.N L_sp_521_mul_17_outer\n\t" #endif + "LDR lr, [%[a], #64]\n\t" + "LDR r11, [%[b], #64]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "LDM sp!, {r6, r7}\n\t" "STM %[r]!, {r6, r7}\n\t" "SUB r5, r5, #0x8\n\t" "\n" "L_sp_521_mul_17_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_521_mul_17_store\n\t" #else @@ -53440,24 +53588,20 @@ static void sp_521_sqr_17(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x88\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_521_sqr_17_outer:\n\t" "SUBS r3, r5, 
#0x40\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_521_sqr_17_inner:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_sqr_17_op_sqr\n\t" -#else - "BEQ.N L_sp_521_sqr_17_op_sqr\n\t" -#endif "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -53467,36 +53611,24 @@ static void sp_521_sqr_17(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_521_sqr_17_op_done\n\t" - "\n" - "L_sp_521_sqr_17_op_sqr:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_521_sqr_17_op_done:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x44\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_sqr_17_inner_done\n\t" -#else - "BEQ.N L_sp_521_sqr_17_inner_done\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_521_sqr_17_inner_done\n\t" #else "BGT.N L_sp_521_sqr_17_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_521_sqr_17_inner\n\t" + "BLT L_sp_521_sqr_17_inner\n\t" #else - "BLE.N L_sp_521_sqr_17_inner\n\t" + "BLT.N L_sp_521_sqr_17_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_521_sqr_17_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -53504,21 +53636,25 @@ static void sp_521_sqr_17(sp_digit* r, const sp_digit* a) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x80\n\t" + "CMP r5, #0x7c\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_521_sqr_17_outer\n\t" #else "BLE.N L_sp_521_sqr_17_outer\n\t" #endif + "LDR lr, [%[a], #64]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "LDM sp!, {r6, r7}\n\t" "STM %[r]!, {r6, r7}\n\t" "SUB r5, r5, #0x8\n\t" "\n" "L_sp_521_sqr_17_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_521_sqr_17_store\n\t" #else @@ -63081,7 +63217,11 @@ static void sp_521_div2_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m "ADCS r4, r4, r8\n\t" "STM %[r]!, {r4}\n\t" "ADC r3, r12, r12\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_div2_mod_17_div2\n\t" +#else + "B.N L_sp_521_div2_mod_17_div2\n\t" +#endif "\n" "L_sp_521_div2_mod_17_even:\n\t" "LDM %[a]!, {r5, r6, r7}\n\t" @@ -63191,7 +63331,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x220\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_16:\n\t" "LDR r1, [%[a], #60]\n\t" @@ -63204,7 +63348,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x200\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B 
L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_15:\n\t" "LDR r1, [%[a], #56]\n\t" @@ -63217,7 +63365,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x1e0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_14:\n\t" "LDR r1, [%[a], #52]\n\t" @@ -63230,7 +63382,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x1c0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_13:\n\t" "LDR r1, [%[a], #48]\n\t" @@ -63243,7 +63399,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x1a0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_12:\n\t" "LDR r1, [%[a], #44]\n\t" @@ -63256,7 +63416,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x180\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_11:\n\t" "LDR r1, [%[a], #40]\n\t" @@ -63269,7 +63433,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x160\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_10:\n\t" "LDR r1, [%[a], #36]\n\t" @@ -63282,7 +63450,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x140\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_9:\n\t" "LDR r1, [%[a], #32]\n\t" @@ -63295,7 +63467,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x120\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_8:\n\t" "LDR r1, [%[a], #28]\n\t" @@ -63308,7 +63484,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x100\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_7:\n\t" "LDR r1, [%[a], #24]\n\t" @@ -63321,7 +63501,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0xe0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_6:\n\t" "LDR r1, [%[a], #20]\n\t" @@ -63334,7 +63518,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0xc0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B 
L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_5:\n\t" "LDR r1, [%[a], #16]\n\t" @@ -63347,7 +63535,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0xa0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_4:\n\t" "LDR r1, [%[a], #12]\n\t" @@ -63360,7 +63552,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x80\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_3:\n\t" "LDR r1, [%[a], #8]\n\t" @@ -63373,7 +63569,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x60\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_2:\n\t" "LDR r1, [%[a], #4]\n\t" @@ -63386,7 +63586,11 @@ static int sp_521_num_bits_17(const sp_digit* a) "MOV r2, #0x40\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" "L_sp_521_num_bits_17_1:\n\t" "LDR r1, [%[a]]\n\t" @@ -67781,15 +67985,18 @@ static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_1024_mul_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_1024_mul_32_inner:\n\t" @@ -67799,20 +68006,31 @@ static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_1024_mul_32_inner_done\n\t" + "BGT L_sp_1024_mul_32_inner_done\n\t" #else - "BEQ.N L_sp_1024_mul_32_inner_done\n\t" + "BGT.N L_sp_1024_mul_32_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_1024_mul_32_inner\n\t" + "BLT L_sp_1024_mul_32_inner\n\t" #else - "BLE.N L_sp_1024_mul_32_inner\n\t" + "BLT.N L_sp_1024_mul_32_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_1024_mul_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -67820,18 +68038,23 @@ static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_1024_mul_32_outer\n\t" #else "BLE.N 
L_sp_1024_mul_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "LDR r11, [%[b], #124]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_1024_mul_32_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_1024_mul_32_store\n\t" #else @@ -67861,24 +68084,20 @@ static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" "L_sp_1024_sqr_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" "L_sp_1024_sqr_32_inner:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_1024_sqr_32_op_sqr\n\t" -#else - "BEQ.N L_sp_1024_sqr_32_op_sqr\n\t" -#endif "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -67888,36 +68107,24 @@ static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_1024_sqr_32_op_done\n\t" - "\n" - "L_sp_1024_sqr_32_op_sqr:\n\t" - "LDR lr, [%[a], r3]\n\t" - "UMULL r9, r10, lr, lr\n\t" - "ADDS r6, r6, r9\n\t" - "ADCS r7, r7, r10\n\t" - "ADC r8, r8, #0x0\n\t" - "\n" - "L_sp_1024_sqr_32_op_done:\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_1024_sqr_32_inner_done\n\t" -#else - "BEQ.N L_sp_1024_sqr_32_inner_done\n\t" -#endif "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_1024_sqr_32_inner_done\n\t" #else "BGT.N L_sp_1024_sqr_32_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_1024_sqr_32_inner\n\t" + "BLT L_sp_1024_sqr_32_inner\n\t" #else - "BLE.N L_sp_1024_sqr_32_inner\n\t" + "BLT.N L_sp_1024_sqr_32_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "UMULL r9, r10, lr, lr\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" "L_sp_1024_sqr_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" @@ -67925,18 +68132,22 @@ static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BLE L_sp_1024_sqr_32_outer\n\t" #else "BLE.N L_sp_1024_sqr_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" "L_sp_1024_sqr_32_store:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) "BGT L_sp_1024_sqr_32_store\n\t" #else
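---

Reviewer note on the sp_arm32.c mul/sqr restructuring above (annotation only, not part of the patch): the old inner loop compared the column index against a fixed end offset every iteration and, in the squaring variants, branched to a separate `_op_sqr`/`_op_done` path to handle the diagonal term. The new shape computes the first partial product (`a[0]*b[0]`, or `a[0]^2`) before the outer loop, lets the inner loop run strictly while `r3 < r4` doing both symmetric products per pass, peels the diagonal (`r3 == r4`) case to the loop exit, and folds the final top-word product in with a single `UMLAL` before the store loop, which now moves eight registers per `LDM`/`STM` instead of four. Below is a minimal C model of that product-scanning shape. It is a sketch, not the shipped code: the name `mul_n` and the `uint64_t`-plus-overflow-word accumulator are illustrative stand-ins for the three-register `r6:r7:r8` ADDS/ADCS/ADC chain, it assumes `n >= 2`, and the squaring variants additionally double the cross products (elided in the hunk context here).

    #include <stdint.h>

    typedef uint32_t sp_digit;

    /* Product-scanning multiply shaped like the patched ASM: column 0 and
     * the top column are peeled out of the loops, the inner loop runs only
     * over i < j pairs, and the diagonal term is handled on loop exit.
     * Hypothetical helper for illustration; requires n >= 2. */
    static void mul_n(sp_digit* r, const sp_digit* a, const sp_digit* b,
                      int n)
    {
        uint64_t acc = (uint64_t)a[0] * b[0]; /* column 0, before the loop */
        sp_digit over = 0;                    /* third accumulator word    */
        int i, j, k;

        r[0] = (sp_digit)acc;
        acc >>= 32;

        for (k = 1; k <= 2 * n - 3; k++) {    /* columns 1 .. 2n-3 */
            i = (k < n) ? 0 : k - (n - 1);    /* SUBS r3, r5, ...; MOVCC */
            j = k - i;
            while (i < j) {                   /* both products per pass  */
                uint64_t p = (uint64_t)a[i] * b[j];
                acc += p; over += (acc < p);
                p = (uint64_t)a[j] * b[i];
                acc += p; over += (acc < p);
                i++; j--;
            }
            if (i == j) {                     /* peeled diagonal term    */
                uint64_t p = (uint64_t)a[i] * b[i];
                acc += p; over += (acc < p);
            }
            r[k] = (sp_digit)acc;             /* STR r6, [sp, r5]        */
            acc = (acc >> 32) | ((uint64_t)over << 32);
            over = 0;
        }

        acc += (uint64_t)a[n - 1] * b[n - 1]; /* UMLAL r6, r7, lr, r11   */
        r[2 * n - 2] = (sp_digit)acc;
        r[2 * n - 1] = (sp_digit)(acc >> 32);
    }

The per-iteration branch removal matters because the inner loop dominates: for n-word operands it executes roughly n^2/2 times, so dropping the `CMP`/`BEQ` pair and the `bal` saves a few cycles per pass. The same IAR/Keil-versus-GNU concern motivates the new `#if defined(__GNUC__)` guards around the unconditional `B`/`B.N` branches in `div2_mod` and `num_bits`: toolchains that reject or mis-assemble the narrow `.N` (or wide `.W`) qualifier get the plain mnemonic, matching the pattern already used for the conditional branches.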