diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm.S b/wolfcrypt/src/port/arm/thumb2-aes-asm.S index f483f87de..0badf8f97 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm.S @@ -670,13 +670,13 @@ L_AES_invert_key_mix_loop: EOR r8, r8, r9, ROR #24 STR r8, [r0], #4 SUBS r11, r11, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_invert_key_mix_loop #else - BNE.N L_AES_invert_key_mix_loop + BNE.W L_AES_invert_key_mix_loop #endif POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 165 + /* Cycle Count = 165 */ .size AES_invert_key,.-AES_invert_key #endif /* HAVE_AES_DECRYPT */ .text @@ -699,20 +699,20 @@ L_AES_Thumb2_rcon: .globl AES_set_encrypt_key .type AES_set_encrypt_key, %function AES_set_encrypt_key: - PUSH {r4, r5, r6, r7, r8, lr} - LDR r8, L_AES_Thumb2_te + PUSH {r4, r5, r6, r7, r8, r9, r10, lr} + LDR r10, L_AES_Thumb2_te ADR lr, L_AES_Thumb2_rcon CMP r1, #0x80 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_set_encrypt_key_start_128 #else - BEQ.N L_AES_set_encrypt_key_start_128 + BEQ.W L_AES_set_encrypt_key_start_128 #endif CMP r1, #0xc0 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_set_encrypt_key_start_192 #else - BEQ.N L_AES_set_encrypt_key_start_192 + BEQ.W L_AES_set_encrypt_key_start_192 #endif LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] @@ -735,10 +735,10 @@ L_AES_set_encrypt_key_loop_256: UBFX r5, r7, #8, #8 UBFX r6, r7, #16, #8 LSR r7, r7, #24 - LDRB r4, [r8, r4, LSL #2] - LDRB r5, [r8, r5, LSL #2] - LDRB r6, [r8, r6, LSL #2] - LDRB r7, [r8, r7, LSL #2] + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r7, [r10, r7, LSL #2] EOR r3, r7, r4, LSL #8 EOR r3, r3, r5, LSL #16 EOR r3, r3, r6, LSL #24 @@ -757,10 +757,10 @@ L_AES_set_encrypt_key_loop_256: UBFX r5, r3, #16, #8 LSR r6, r3, #24 UBFX r3, r3, #0, #8 - LDRB r4, [r8, r4, LSL #2] - LDRB r6, [r8, r6, LSL #2] - LDRB r5, [r8, r5, LSL #2] - LDRB r3, [r8, r3, LSL #2] + LDRB r4, [r10, r4, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r3, [r10, r3, LSL #2] EOR r3, r3, r4, LSL #8 EOR r3, r3, r5, LSL #16 EOR r3, r3, r6, LSL #24 @@ -782,10 +782,10 @@ L_AES_set_encrypt_key_loop_256: UBFX r5, r7, #8, #8 UBFX r6, r7, #16, #8 LSR r7, r7, #24 - LDRB r4, [r8, r4, LSL #2] - LDRB r5, [r8, r5, LSL #2] - LDRB r6, [r8, r6, LSL #2] - LDRB r7, [r8, r7, LSL #2] + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r7, [r10, r7, LSL #2] EOR r3, r7, r4, LSL #8 EOR r3, r3, r5, LSL #16 EOR r3, r3, r6, LSL #24 @@ -799,69 +799,77 @@ L_AES_set_encrypt_key_loop_256: ADD r2, r2, #0x10 STM r2, {r4, r5, r6, r7} SUB r2, r2, #0x10 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_set_encrypt_key_end +#else + B.N L_AES_set_encrypt_key_end +#endif L_AES_set_encrypt_key_start_192: LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] - LDRD r0, r1, [r0, #16] + LDRD r8, r9, [r0, #16] REV r4, r4 REV r5, r5 REV r6, r6 REV r7, r7 - REV r0, r0 - REV r1, r1 + REV r8, r8 + REV r9, r9 STM r2, {r4, r5, r6, r7} - STRD r0, r1, [r2, #16] - MOV r7, r1 + STRD r8, r9, [r2, #16] + MOV r7, r9 MOV r12, #0x7 L_AES_set_encrypt_key_loop_192: - UBFX r0, r7, #0, #8 - UBFX r1, r7, #8, #8 - UBFX r4, r7, #16, #8 - LSR r7, r7, #24 - LDRB r0, [r8, r0, LSL #2] - LDRB r1, [r8, r1, LSL #2] - LDRB r4, [r8, r4, LSL #2] - LDRB r7, [r8, r7, LSL 
#2] - EOR r3, r7, r0, LSL #8 - EOR r3, r3, r1, LSL #16 - EOR r3, r3, r4, LSL #24 - LDM r2!, {r0, r1, r4, r5, r6, r7} - EOR r0, r0, r3 + UBFX r4, r9, #0, #8 + UBFX r5, r9, #8, #8 + UBFX r6, r9, #16, #8 + LSR r9, r9, #24 + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r9, [r10, r9, LSL #2] + EOR r3, r9, r4, LSL #8 + EOR r3, r3, r5, LSL #16 + EOR r3, r3, r6, LSL #24 + LDM r2!, {r4, r5, r6, r7, r8, r9} + EOR r4, r4, r3 LDM lr!, {r3} - EOR r0, r0, r3 - EOR r1, r1, r0 - EOR r4, r4, r1 + EOR r4, r4, r3 EOR r5, r5, r4 EOR r6, r6, r5 EOR r7, r7, r6 - STM r2, {r0, r1, r4, r5, r6, r7} + EOR r8, r8, r7 + EOR r9, r9, r8 + STM r2, {r4, r5, r6, r7, r8, r9} SUBS r12, r12, #0x1 #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) BNE L_AES_set_encrypt_key_loop_192 #else BNE.N L_AES_set_encrypt_key_loop_192 #endif - UBFX r0, r7, #0, #8 - UBFX r1, r7, #8, #8 - UBFX r4, r7, #16, #8 - LSR r7, r7, #24 - LDRB r0, [r8, r0, LSL #2] - LDRB r1, [r8, r1, LSL #2] - LDRB r4, [r8, r4, LSL #2] - LDRB r7, [r8, r7, LSL #2] - EOR r3, r7, r0, LSL #8 - EOR r3, r3, r1, LSL #16 - EOR r3, r3, r4, LSL #24 - LDM r2!, {r0, r1, r4, r5, r6, r7} - EOR r0, r0, r3 + UBFX r4, r9, #0, #8 + UBFX r5, r9, #8, #8 + UBFX r6, r9, #16, #8 + LSR r9, r9, #24 + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r9, [r10, r9, LSL #2] + EOR r3, r9, r4, LSL #8 + EOR r3, r3, r5, LSL #16 + EOR r3, r3, r6, LSL #24 + LDM r2!, {r4, r5, r6, r7, r8, r9} + EOR r4, r4, r3 LDM lr!, {r3} - EOR r0, r0, r3 - EOR r1, r1, r0 - EOR r4, r4, r1 + EOR r4, r4, r3 EOR r5, r5, r4 - STM r2, {r0, r1, r4, r5} + EOR r6, r6, r5 + EOR r7, r7, r6 + STM r2, {r4, r5, r6, r7} +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_set_encrypt_key_end +#else + B.N L_AES_set_encrypt_key_end +#endif L_AES_set_encrypt_key_start_128: LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] @@ -876,10 +884,10 @@ L_AES_set_encrypt_key_loop_128: UBFX r5, r7, #8, #8 UBFX r6, r7, #16, #8 LSR r7, r7, #24 - LDRB r4, [r8, r4, LSL #2] - LDRB r5, [r8, r5, LSL #2] - LDRB r6, [r8, r6, LSL #2] - LDRB r7, [r8, r7, LSL #2] + LDRB r4, [r10, r4, LSL #2] + LDRB r5, [r10, r5, LSL #2] + LDRB r6, [r10, r6, LSL #2] + LDRB r7, [r10, r7, LSL #2] EOR r3, r7, r4, LSL #8 EOR r3, r3, r5, LSL #16 EOR r3, r3, r6, LSL #24 @@ -898,8 +906,8 @@ L_AES_set_encrypt_key_loop_128: BNE.N L_AES_set_encrypt_key_loop_128 #endif L_AES_set_encrypt_key_end: - POP {r4, r5, r6, r7, r8, pc} - # Cycle Count = 327 + POP {r4, r5, r6, r7, r8, r9, r10, pc} + /* Cycle Count = 331 */ .size AES_set_encrypt_key,.-AES_set_encrypt_key .text .align 4 @@ -953,7 +961,7 @@ L_AES_encrypt_block_nr: LDM r3!, {r4, r5, r6, r7} EOR r11, r11, lr, ROR #24 EOR r11, r11, r2, ROR #8 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r8, r8, r4 EOR r9, r9, r5 EOR r10, r10, r6 @@ -1003,16 +1011,16 @@ L_AES_encrypt_block_nr: LDM r3!, {r8, r9, r10, r11} EOR r7, r7, lr, ROR #24 EOR r7, r7, r2, ROR #8 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 EOR r7, r7, r11 SUBS r1, r1, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_encrypt_block_nr #else - BNE.N L_AES_encrypt_block_nr + BNE.W L_AES_encrypt_block_nr #endif UBFX r8, r5, #16, #8 LSR r11, r4, #24 @@ -1059,7 +1067,7 @@ L_AES_encrypt_block_nr: LDM r3!, {r4, r5, r6, r7} EOR r11, r11, lr, ROR #24 EOR r11, r11, r2, ROR #8 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r8, 
r8, r4 EOR r9, r9, r5 EOR r10, r10, r6 @@ -1109,13 +1117,13 @@ L_AES_encrypt_block_nr: LDM r3, {r8, r9, r10, r11} EOR r7, r7, lr, LSL #8 EOR r7, r7, r2, LSL #16 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 EOR r7, r7, r11 POP {pc} - # Cycle Count = 285 + /* Cycle Count = 285 */ .size AES_encrypt_block,.-AES_encrypt_block #if defined(HAVE_AES_CBC) || defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) .text @@ -1137,16 +1145,16 @@ AES_ECB_encrypt: LDR r12, [sp, #36] PUSH {r3} CMP r12, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_encrypt_start_block_128 #else - BEQ.N L_AES_ECB_encrypt_start_block_128 + BEQ.W L_AES_ECB_encrypt_start_block_128 #endif CMP r12, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_encrypt_start_block_192 #else - BEQ.N L_AES_ECB_encrypt_start_block_192 + BEQ.W L_AES_ECB_encrypt_start_block_192 #endif L_AES_ECB_encrypt_loop_block_256: LDR r4, [lr] @@ -1159,7 +1167,7 @@ L_AES_ECB_encrypt_loop_block_256: REV r7, r7 PUSH {r1, r2, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1179,12 +1187,16 @@ L_AES_ECB_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_encrypt_loop_block_256 #else - BNE.N L_AES_ECB_encrypt_loop_block_256 + BNE.W L_AES_ECB_encrypt_loop_block_256 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_encrypt_end +#else + B.N L_AES_ECB_encrypt_end +#endif L_AES_ECB_encrypt_start_block_192: L_AES_ECB_encrypt_loop_block_192: LDR r4, [lr] @@ -1197,7 +1209,7 @@ L_AES_ECB_encrypt_loop_block_192: REV r7, r7 PUSH {r1, r2, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1217,12 +1229,16 @@ L_AES_ECB_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_encrypt_loop_block_192 #else - BNE.N L_AES_ECB_encrypt_loop_block_192 + BNE.W L_AES_ECB_encrypt_loop_block_192 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_encrypt_end +#else + B.N L_AES_ECB_encrypt_end +#endif L_AES_ECB_encrypt_start_block_128: L_AES_ECB_encrypt_loop_block_128: LDR r4, [lr] @@ -1235,7 +1251,7 @@ L_AES_ECB_encrypt_loop_block_128: REV r7, r7 PUSH {r1, r2, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1255,15 +1271,15 @@ L_AES_ECB_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_encrypt_loop_block_128 #else - BNE.N L_AES_ECB_encrypt_loop_block_128 + BNE.W L_AES_ECB_encrypt_loop_block_128 #endif L_AES_ECB_encrypt_end: POP {r3} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 212 + /* Cycle Count = 212 */ .size AES_ECB_encrypt,.-AES_ECB_encrypt #endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ #ifdef 
HAVE_AES_CBC @@ -1280,16 +1296,16 @@ AES_CBC_encrypt: LDM r9, {r4, r5, r6, r7} PUSH {r3, r9} CMP r8, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_encrypt_start_block_128 #else - BEQ.N L_AES_CBC_encrypt_start_block_128 + BEQ.W L_AES_CBC_encrypt_start_block_128 #endif CMP r8, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_encrypt_start_block_192 #else - BEQ.N L_AES_CBC_encrypt_start_block_192 + BEQ.W L_AES_CBC_encrypt_start_block_192 #endif L_AES_CBC_encrypt_loop_block_256: LDR r8, [lr] @@ -1306,7 +1322,7 @@ L_AES_CBC_encrypt_loop_block_256: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1326,12 +1342,16 @@ L_AES_CBC_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_encrypt_loop_block_256 #else - BNE.N L_AES_CBC_encrypt_loop_block_256 + BNE.W L_AES_CBC_encrypt_loop_block_256 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_CBC_encrypt_end +#else + B.N L_AES_CBC_encrypt_end +#endif L_AES_CBC_encrypt_start_block_192: L_AES_CBC_encrypt_loop_block_192: LDR r8, [lr] @@ -1348,7 +1368,7 @@ L_AES_CBC_encrypt_loop_block_192: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1368,12 +1388,16 @@ L_AES_CBC_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_encrypt_loop_block_192 #else - BNE.N L_AES_CBC_encrypt_loop_block_192 + BNE.W L_AES_CBC_encrypt_loop_block_192 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_CBC_encrypt_end +#else + B.N L_AES_CBC_encrypt_end +#endif L_AES_CBC_encrypt_start_block_128: L_AES_CBC_encrypt_loop_block_128: LDR r8, [lr] @@ -1390,7 +1414,7 @@ L_AES_CBC_encrypt_loop_block_128: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1410,16 +1434,16 @@ L_AES_CBC_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_encrypt_loop_block_128 #else - BNE.N L_AES_CBC_encrypt_loop_block_128 + BNE.W L_AES_CBC_encrypt_loop_block_128 #endif L_AES_CBC_encrypt_end: POP {r3, r9} STM r9, {r4, r5, r6, r7} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 238 + /* Cycle Count = 238 */ .size AES_CBC_encrypt,.-AES_CBC_encrypt #endif /* HAVE_AES_CBC */ #ifdef WOLFSSL_AES_COUNTER @@ -1441,16 +1465,16 @@ AES_CTR_encrypt: STM r8, {r4, r5, r6, r7} PUSH {r3, r8} CMP r12, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CTR_encrypt_start_block_128 #else - BEQ.N L_AES_CTR_encrypt_start_block_128 + BEQ.W L_AES_CTR_encrypt_start_block_128 #endif CMP r12, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CTR_encrypt_start_block_192 #else - BEQ.N L_AES_CTR_encrypt_start_block_192 + BEQ.W L_AES_CTR_encrypt_start_block_192 #endif 
L_AES_CTR_encrypt_loop_block_256: PUSH {r1, r2, lr} @@ -1461,7 +1485,7 @@ L_AES_CTR_encrypt_loop_block_256: ADC r8, r4, #0x0 STM lr, {r8, r9, r10, r11} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1491,12 +1515,16 @@ L_AES_CTR_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CTR_encrypt_loop_block_256 #else - BNE.N L_AES_CTR_encrypt_loop_block_256 + BNE.W L_AES_CTR_encrypt_loop_block_256 #endif +#ifdef __GNUC__ B L_AES_CTR_encrypt_end +#else + B.W L_AES_CTR_encrypt_end +#endif L_AES_CTR_encrypt_start_block_192: L_AES_CTR_encrypt_loop_block_192: PUSH {r1, r2, lr} @@ -1507,7 +1535,7 @@ L_AES_CTR_encrypt_loop_block_192: ADC r8, r4, #0x0 STM lr, {r8, r9, r10, r11} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1537,12 +1565,16 @@ L_AES_CTR_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CTR_encrypt_loop_block_192 #else - BNE.N L_AES_CTR_encrypt_loop_block_192 + BNE.W L_AES_CTR_encrypt_loop_block_192 #endif +#ifdef __GNUC__ B L_AES_CTR_encrypt_end +#else + B.W L_AES_CTR_encrypt_end +#endif L_AES_CTR_encrypt_start_block_128: L_AES_CTR_encrypt_loop_block_128: PUSH {r1, r2, lr} @@ -1553,7 +1585,7 @@ L_AES_CTR_encrypt_loop_block_128: ADC r8, r4, #0x0 STM lr, {r8, r9, r10, r11} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -1583,10 +1615,10 @@ L_AES_CTR_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CTR_encrypt_loop_block_128 #else - BNE.N L_AES_CTR_encrypt_loop_block_128 + BNE.W L_AES_CTR_encrypt_loop_block_128 #endif L_AES_CTR_encrypt_end: POP {r3, r8} @@ -1596,7 +1628,7 @@ L_AES_CTR_encrypt_end: REV r7, r7 STM r8, {r4, r5, r6, r7} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 293 + /* Cycle Count = 293 */ .size AES_CTR_encrypt,.-AES_CTR_encrypt #endif /* WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_DECRYPT @@ -1653,7 +1685,7 @@ L_AES_decrypt_block_nr: LDM r3!, {r4, r5, r6, r7} EOR r11, r11, lr, ROR #8 EOR r11, r11, r12, ROR #24 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r8, r8, r4 EOR r9, r9, r5 EOR r10, r10, r6 @@ -1703,16 +1735,16 @@ L_AES_decrypt_block_nr: LDM r3!, {r8, r9, r10, r11} EOR r7, r7, lr, ROR #8 EOR r7, r7, r12, ROR #24 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 EOR r7, r7, r11 SUBS r1, r1, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_decrypt_block_nr #else - BNE.N L_AES_decrypt_block_nr + BNE.W L_AES_decrypt_block_nr #endif UBFX r8, r7, #16, #8 LSR r11, r4, #24 @@ -1759,7 +1791,7 @@ L_AES_decrypt_block_nr: LDM r3!, {r4, r5, r6, r7} EOR r11, r11, lr, ROR #8 EOR r11, r11, r12, ROR #24 - # XOR in Key Schedule + /* XOR in Key Schedule */ EOR r8, r8, r4 EOR r9, r9, r5 EOR r10, r10, r6 @@ -1809,13 +1841,13 @@ L_AES_decrypt_block_nr: LDM r3, {r8, r9, r10, r11} EOR r7, r7, r12, LSL #8 EOR r7, r7, lr, LSL #16 - # XOR in Key Schedule + /* 
XOR in Key Schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 EOR r7, r7, r11 POP {pc} - # Cycle Count = 285 + /* Cycle Count = 285 */ .size AES_decrypt_block,.-AES_decrypt_block .text .type L_AES_Thumb2_td_ecb, %object @@ -2097,16 +2129,16 @@ AES_ECB_decrypt: MOV r12, r2 ADR r2, L_AES_Thumb2_td4 CMP r8, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_decrypt_start_block_128 #else - BEQ.N L_AES_ECB_decrypt_start_block_128 + BEQ.W L_AES_ECB_decrypt_start_block_128 #endif CMP r8, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_ECB_decrypt_start_block_192 #else - BEQ.N L_AES_ECB_decrypt_start_block_192 + BEQ.W L_AES_ECB_decrypt_start_block_192 #endif L_AES_ECB_decrypt_loop_block_256: LDR r4, [lr] @@ -2119,7 +2151,7 @@ L_AES_ECB_decrypt_loop_block_256: REV r7, r7 PUSH {r1, r3, r12, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2138,12 +2170,16 @@ L_AES_ECB_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_decrypt_loop_block_256 #else - BNE.N L_AES_ECB_decrypt_loop_block_256 + BNE.W L_AES_ECB_decrypt_loop_block_256 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_decrypt_end +#else + B.N L_AES_ECB_decrypt_end +#endif L_AES_ECB_decrypt_start_block_192: L_AES_ECB_decrypt_loop_block_192: LDR r4, [lr] @@ -2156,7 +2192,7 @@ L_AES_ECB_decrypt_loop_block_192: REV r7, r7 PUSH {r1, r3, r12, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2175,12 +2211,16 @@ L_AES_ECB_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_decrypt_loop_block_192 #else - BNE.N L_AES_ECB_decrypt_loop_block_192 + BNE.W L_AES_ECB_decrypt_loop_block_192 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_ECB_decrypt_end +#else + B.N L_AES_ECB_decrypt_end +#endif L_AES_ECB_decrypt_start_block_128: L_AES_ECB_decrypt_loop_block_128: LDR r4, [lr] @@ -2193,7 +2233,7 @@ L_AES_ECB_decrypt_loop_block_128: REV r7, r7 PUSH {r1, r3, r12, lr} LDM r3!, {r8, r9, r10, r11} - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2212,14 +2252,14 @@ L_AES_ECB_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_ECB_decrypt_loop_block_128 #else - BNE.N L_AES_ECB_decrypt_loop_block_128 + BNE.W L_AES_ECB_decrypt_loop_block_128 #endif L_AES_ECB_decrypt_end: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 210 + /* Cycle Count = 210 */ .size AES_ECB_decrypt,.-AES_ECB_decrypt #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ #ifdef HAVE_AES_CBC @@ -2237,16 +2277,16 @@ AES_CBC_decrypt: ADR r2, L_AES_Thumb2_td4 PUSH {r3, r4} CMP r8, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_loop_block_128 #else - BEQ.N L_AES_CBC_decrypt_loop_block_128 + BEQ.W 
L_AES_CBC_decrypt_loop_block_128 #endif CMP r8, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_loop_block_192 #else - BEQ.N L_AES_CBC_decrypt_loop_block_192 + BEQ.W L_AES_CBC_decrypt_loop_block_192 #endif L_AES_CBC_decrypt_loop_block_256: PUSH {r1, r12, lr} @@ -2262,7 +2302,7 @@ L_AES_CBC_decrypt_loop_block_256: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2288,10 +2328,10 @@ L_AES_CBC_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_end_odd #else - BEQ.N L_AES_CBC_decrypt_end_odd + BEQ.W L_AES_CBC_decrypt_end_odd #endif PUSH {r1, r12, lr} LDR r4, [lr] @@ -2306,7 +2346,7 @@ L_AES_CBC_decrypt_loop_block_256: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2333,12 +2373,16 @@ L_AES_CBC_decrypt_loop_block_256: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_decrypt_loop_block_256 #else - BNE.N L_AES_CBC_decrypt_loop_block_256 + BNE.W L_AES_CBC_decrypt_loop_block_256 #endif +#ifdef __GNUC__ B L_AES_CBC_decrypt_end +#else + B.W L_AES_CBC_decrypt_end +#endif L_AES_CBC_decrypt_loop_block_192: PUSH {r1, r12, lr} LDR r4, [lr] @@ -2353,7 +2397,7 @@ L_AES_CBC_decrypt_loop_block_192: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2379,10 +2423,10 @@ L_AES_CBC_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_end_odd #else - BEQ.N L_AES_CBC_decrypt_end_odd + BEQ.W L_AES_CBC_decrypt_end_odd #endif PUSH {r1, r12, lr} LDR r4, [lr] @@ -2397,7 +2441,7 @@ L_AES_CBC_decrypt_loop_block_192: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2424,12 +2468,16 @@ L_AES_CBC_decrypt_loop_block_192: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_decrypt_loop_block_192 #else - BNE.N L_AES_CBC_decrypt_loop_block_192 + BNE.W L_AES_CBC_decrypt_loop_block_192 #endif +#ifdef __GNUC__ B L_AES_CBC_decrypt_end +#else + B.W L_AES_CBC_decrypt_end +#endif L_AES_CBC_decrypt_loop_block_128: PUSH {r1, r12, lr} LDR r4, [lr] @@ -2444,7 +2492,7 @@ L_AES_CBC_decrypt_loop_block_128: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2470,10 +2518,10 @@ L_AES_CBC_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_CBC_decrypt_end_odd #else - BEQ.N L_AES_CBC_decrypt_end_odd + BEQ.W L_AES_CBC_decrypt_end_odd #endif PUSH {r1, r12, lr} LDR r4, [lr] @@ -2488,7 +2536,7 @@ L_AES_CBC_decrypt_loop_block_128: REV r5, r5 REV r6, r6 REV r7, r7 - # Round: 0 - XOR in key schedule + /* 
Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -2515,12 +2563,16 @@ L_AES_CBC_decrypt_loop_block_128: SUBS r12, r12, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_CBC_decrypt_loop_block_128 #else - BNE.N L_AES_CBC_decrypt_loop_block_128 + BNE.W L_AES_CBC_decrypt_loop_block_128 #endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) B L_AES_CBC_decrypt_end +#else + B.N L_AES_CBC_decrypt_end +#endif L_AES_CBC_decrypt_end_odd: LDR r4, [sp, #4] LDRD r8, r9, [r4, #16] @@ -2530,7 +2582,7 @@ L_AES_CBC_decrypt_end_odd: L_AES_CBC_decrypt_end: POP {r3, r4} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 518 + /* Cycle Count = 518 */ .size AES_CBC_decrypt,.-AES_CBC_decrypt #endif /* HAVE_AES_CBC */ #endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || HAVE_AES_CBC */ @@ -3109,13 +3161,13 @@ L_GCM_gmult_len_start_block: POP {r3} SUBS r3, r3, #0x10 ADD r2, r2, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_GCM_gmult_len_start_block #else - BNE.N L_GCM_gmult_len_start_block + BNE.W L_GCM_gmult_len_start_block #endif POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 742 + /* Cycle Count = 742 */ .size GCM_gmult_len,.-GCM_gmult_len .text .type L_AES_Thumb2_te_gcm, %object @@ -3141,16 +3193,16 @@ AES_GCM_encrypt: STM r8, {r4, r5, r6, r7} PUSH {r3, r8} CMP r12, #0xa -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_GCM_encrypt_start_block_128 #else - BEQ.N L_AES_GCM_encrypt_start_block_128 + BEQ.W L_AES_GCM_encrypt_start_block_128 #endif CMP r12, #0xc -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BEQ L_AES_GCM_encrypt_start_block_192 #else - BEQ.N L_AES_GCM_encrypt_start_block_192 + BEQ.W L_AES_GCM_encrypt_start_block_192 #endif L_AES_GCM_encrypt_loop_block_256: PUSH {r1, r2, lr} @@ -3158,7 +3210,7 @@ L_AES_GCM_encrypt_loop_block_256: ADD r7, r7, #0x1 LDM r3!, {r8, r9, r10, r11} STR r7, [lr, #12] - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -3188,12 +3240,16 @@ L_AES_GCM_encrypt_loop_block_256: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_GCM_encrypt_loop_block_256 #else - BNE.N L_AES_GCM_encrypt_loop_block_256 + BNE.W L_AES_GCM_encrypt_loop_block_256 #endif +#ifdef __GNUC__ B L_AES_GCM_encrypt_end +#else + B.W L_AES_GCM_encrypt_end +#endif L_AES_GCM_encrypt_start_block_192: L_AES_GCM_encrypt_loop_block_192: PUSH {r1, r2, lr} @@ -3201,7 +3257,7 @@ L_AES_GCM_encrypt_loop_block_192: ADD r7, r7, #0x1 LDM r3!, {r8, r9, r10, r11} STR r7, [lr, #12] - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -3231,12 +3287,16 @@ L_AES_GCM_encrypt_loop_block_192: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_GCM_encrypt_loop_block_192 #else - BNE.N L_AES_GCM_encrypt_loop_block_192 + BNE.W L_AES_GCM_encrypt_loop_block_192 #endif +#ifdef __GNUC__ B L_AES_GCM_encrypt_end +#else + B.W L_AES_GCM_encrypt_end +#endif L_AES_GCM_encrypt_start_block_128: L_AES_GCM_encrypt_loop_block_128: PUSH {r1, r2, lr} @@ 
-3244,7 +3304,7 @@ L_AES_GCM_encrypt_loop_block_128: ADD r7, r7, #0x1 LDM r3!, {r8, r9, r10, r11} STR r7, [lr, #12] - # Round: 0 - XOR in key schedule + /* Round: 0 - XOR in key schedule */ EOR r4, r4, r8 EOR r5, r5, r9 EOR r6, r6, r10 @@ -3274,10 +3334,10 @@ L_AES_GCM_encrypt_loop_block_128: SUBS r2, r2, #0x10 ADD lr, lr, #0x10 ADD r1, r1, #0x10 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_AES_GCM_encrypt_loop_block_128 #else - BNE.N L_AES_GCM_encrypt_loop_block_128 + BNE.W L_AES_GCM_encrypt_loop_block_128 #endif L_AES_GCM_encrypt_end: POP {r3, r8} @@ -3287,7 +3347,7 @@ L_AES_GCM_encrypt_end: REV r7, r7 STM r8, {r4, r5, r6, r7} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 275 + /* Cycle Count = 275 */ .size AES_GCM_encrypt,.-AES_GCM_encrypt #endif /* HAVE_AESGCM */ #endif /* !NO_AES */ diff --git a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c index 48b5edc16..7d5357f1a 100644 --- a/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-aes-asm_c.c @@ -39,7 +39,7 @@ #ifdef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_ARMASM -#if !defined(__aarch64__) && defined(__arm__) +#if !defined(__aarch64__) && defined(__thumb__) #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm @@ -208,9 +208,9 @@ void AES_invert_key(unsigned char* ks, word32 rounds) #ifndef WOLFSSL_NO_VAR_ASSIGN_REG register unsigned char* ks __asm__ ("r0") = (unsigned char*)ks_p; register word32 rounds __asm__ ("r1") = (word32)rounds_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_c __asm__ ("r2") = (uint32_t*)L_AES_Thumb2_te; register uint32_t* L_AES_Thumb2_td_c __asm__ ("r3") = (uint32_t*)L_AES_Thumb2_td; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( "MOV r12, %[L_AES_Thumb2_te]\n\t" @@ -218,7 +218,7 @@ void AES_invert_key(unsigned char* ks, word32 rounds) "ADD r10, %[ks], %[rounds], LSL #4\n\t" "MOV r11, %[rounds]\n\t" "\n" - "L_AES_invert_key_loop_%=:\n\t" + "L_AES_invert_key_loop:\n\t" "LDM %[ks], {r2, r3, r4, r5}\n\t" "LDM r10, {r6, r7, r8, r9}\n\t" "STM r10, {r2, r3, r4, r5}\n\t" @@ -226,15 +226,15 @@ void AES_invert_key(unsigned char* ks, word32 rounds) "SUBS r11, r11, #0x2\n\t" "SUB r10, r10, #0x10\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_invert_key_loop_%=\n\t" + "BNE L_AES_invert_key_loop\n\t" #else - "BNE.N L_AES_invert_key_loop_%=\n\t" + "BNE.N L_AES_invert_key_loop\n\t" #endif "SUB %[ks], %[ks], %[rounds], LSL #3\n\t" "ADD %[ks], %[ks], #0x10\n\t" "SUB r11, %[rounds], #0x1\n\t" "\n" - "L_AES_invert_key_mix_loop_%=:\n\t" + "L_AES_invert_key_mix_loop:\n\t" "LDM %[ks], {r2, r3, r4, r5}\n\t" "UBFX r6, r2, #0, #8\n\t" "UBFX r7, r2, #8, #8\n\t" @@ -301,13 +301,19 @@ void AES_invert_key(unsigned char* ks, word32 rounds) "EOR r8, r8, r9, ROR #24\n\t" "STR r8, [%[ks]], #4\n\t" "SUBS r11, r11, #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_invert_key_mix_loop_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_invert_key_mix_loop\n\t" #else - "BNE.N L_AES_invert_key_mix_loop_%=\n\t" + "BNE.W L_AES_invert_key_mix_loop\n\t" #endif - : [ks] "+r" (ks), [rounds] "+r" (rounds), [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_td] "+r" (L_AES_Thumb2_td_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [ks] "+r" (ks), [rounds] "+r" (rounds), + [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_td] "+r" (L_AES_Thumb2_td_c) : +#else + : [ks] "+r" (ks), [rounds] "+r" 
(rounds) + : [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te), [L_AES_Thumb2_td] "r" (L_AES_Thumb2_td) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); } @@ -316,7 +322,7 @@ void AES_invert_key(unsigned char* ks, word32 rounds) static const uint32_t L_AES_Thumb2_rcon[] = { 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000, - 0x1b000000, 0x36000000, + 0x1b000000, 0x36000000 }; void AES_set_encrypt_key(const unsigned char* key, word32 len, @@ -331,24 +337,24 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks register const unsigned char* key __asm__ ("r0") = (const unsigned char*)key_p; register word32 len __asm__ ("r1") = (word32)len_p; register unsigned char* ks __asm__ ("r2") = (unsigned char*)ks_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_c __asm__ ("r3") = (uint32_t*)L_AES_Thumb2_te; register uint32_t* L_AES_Thumb2_rcon_c __asm__ ("r4") = (uint32_t*)&L_AES_Thumb2_rcon; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( - "MOV r8, %[L_AES_Thumb2_te]\n\t" + "MOV r10, %[L_AES_Thumb2_te]\n\t" "MOV lr, %[L_AES_Thumb2_rcon]\n\t" "CMP %[len], #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_set_encrypt_key_start_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_set_encrypt_key_start_128\n\t" #else - "BEQ.N L_AES_set_encrypt_key_start_128_%=\n\t" + "BEQ.W L_AES_set_encrypt_key_start_128\n\t" #endif "CMP %[len], #0xc0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_set_encrypt_key_start_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_set_encrypt_key_start_192\n\t" #else - "BEQ.N L_AES_set_encrypt_key_start_192_%=\n\t" + "BEQ.W L_AES_set_encrypt_key_start_192\n\t" #endif "LDRD r4, r5, [%[key]]\n\t" "LDRD r6, r7, [%[key], #8]\n\t" @@ -367,15 +373,15 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "SUB %[ks], %[ks], #0x10\n\t" "MOV r12, #0x6\n\t" "\n" - "L_AES_set_encrypt_key_loop_256_%=:\n\t" + "L_AES_set_encrypt_key_loop_256:\n\t" "UBFX r4, r7, #0, #8\n\t" "UBFX r5, r7, #8, #8\n\t" "UBFX r6, r7, #16, #8\n\t" "LSR r7, r7, #24\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r5, [r8, r5, LSL #2]\n\t" - "LDRB r6, [r8, r6, LSL #2]\n\t" - "LDRB r7, [r8, r7, LSL #2]\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r7, [r10, r7, LSL #2]\n\t" "EOR r3, r7, r4, LSL #8\n\t" "EOR r3, r3, r5, LSL #16\n\t" "EOR r3, r3, r6, LSL #24\n\t" @@ -394,10 +400,10 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "UBFX r5, r3, #16, #8\n\t" "LSR r6, r3, #24\n\t" "UBFX r3, r3, #0, #8\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r6, [r8, r6, LSL #2]\n\t" - "LDRB r5, [r8, r5, LSL #2]\n\t" - "LDRB r3, [r8, r3, LSL #2]\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r3, [r10, r3, LSL #2]\n\t" "EOR r3, r3, r4, LSL #8\n\t" "EOR r3, r3, r5, LSL #16\n\t" "EOR r3, r3, r6, LSL #24\n\t" @@ -411,18 +417,18 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "SUB %[ks], %[ks], #0x10\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_set_encrypt_key_loop_256_%=\n\t" + "BNE L_AES_set_encrypt_key_loop_256\n\t" #else - "BNE.N L_AES_set_encrypt_key_loop_256_%=\n\t" + 
"BNE.N L_AES_set_encrypt_key_loop_256\n\t" #endif "UBFX r4, r7, #0, #8\n\t" "UBFX r5, r7, #8, #8\n\t" "UBFX r6, r7, #16, #8\n\t" "LSR r7, r7, #24\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r5, [r8, r5, LSL #2]\n\t" - "LDRB r6, [r8, r6, LSL #2]\n\t" - "LDRB r7, [r8, r7, LSL #2]\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r7, [r10, r7, LSL #2]\n\t" "EOR r3, r7, r4, LSL #8\n\t" "EOR r3, r3, r5, LSL #16\n\t" "EOR r3, r3, r6, LSL #24\n\t" @@ -436,73 +442,81 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "ADD %[ks], %[ks], #0x10\n\t" "STM %[ks], {r4, r5, r6, r7}\n\t" "SUB %[ks], %[ks], #0x10\n\t" - "B L_AES_set_encrypt_key_end_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_set_encrypt_key_end\n\t" +#else + "B.N L_AES_set_encrypt_key_end\n\t" +#endif "\n" - "L_AES_set_encrypt_key_start_192_%=:\n\t" + "L_AES_set_encrypt_key_start_192:\n\t" "LDRD r4, r5, [%[key]]\n\t" "LDRD r6, r7, [%[key], #8]\n\t" - "LDRD %[key], %[len], [%[key], #16]\n\t" + "LDRD r8, r9, [%[key], #16]\n\t" "REV r4, r4\n\t" "REV r5, r5\n\t" "REV r6, r6\n\t" "REV r7, r7\n\t" - "REV %[key], %[key]\n\t" - "REV %[len], %[len]\n\t" + "REV r8, r8\n\t" + "REV r9, r9\n\t" "STM %[ks], {r4, r5, r6, r7}\n\t" - "STRD %[key], %[len], [%[ks], #16]\n\t" - "MOV r7, %[len]\n\t" + "STRD r8, r9, [%[ks], #16]\n\t" + "MOV r7, r9\n\t" "MOV r12, #0x7\n\t" "\n" - "L_AES_set_encrypt_key_loop_192_%=:\n\t" - "UBFX r0, r7, #0, #8\n\t" - "UBFX r1, r7, #8, #8\n\t" - "UBFX r4, r7, #16, #8\n\t" - "LSR r7, r7, #24\n\t" - "LDRB r0, [r8, r0, LSL #2]\n\t" - "LDRB r1, [r8, r1, LSL #2]\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r7, [r8, r7, LSL #2]\n\t" - "EOR r3, r7, r0, LSL #8\n\t" - "EOR r3, r3, r1, LSL #16\n\t" - "EOR r3, r3, r4, LSL #24\n\t" - "LDM %[ks]!, {r0, r1, r4, r5, r6, r7}\n\t" - "EOR r0, r0, r3\n\t" + "L_AES_set_encrypt_key_loop_192:\n\t" + "UBFX r4, r9, #0, #8\n\t" + "UBFX r5, r9, #8, #8\n\t" + "UBFX r6, r9, #16, #8\n\t" + "LSR r9, r9, #24\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r9, [r10, r9, LSL #2]\n\t" + "EOR r3, r9, r4, LSL #8\n\t" + "EOR r3, r3, r5, LSL #16\n\t" + "EOR r3, r3, r6, LSL #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7, r8, r9}\n\t" + "EOR r4, r4, r3\n\t" "LDM lr!, {r3}\n\t" - "EOR r0, r0, r3\n\t" - "EOR r1, r1, r0\n\t" - "EOR r4, r4, r1\n\t" + "EOR r4, r4, r3\n\t" "EOR r5, r5, r4\n\t" "EOR r6, r6, r5\n\t" "EOR r7, r7, r6\n\t" - "STM %[ks], {r0, r1, r4, r5, r6, r7}\n\t" + "EOR r8, r8, r7\n\t" + "EOR r9, r9, r8\n\t" + "STM %[ks], {r4, r5, r6, r7, r8, r9}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_set_encrypt_key_loop_192_%=\n\t" + "BNE L_AES_set_encrypt_key_loop_192\n\t" #else - "BNE.N L_AES_set_encrypt_key_loop_192_%=\n\t" + "BNE.N L_AES_set_encrypt_key_loop_192\n\t" #endif - "UBFX r0, r7, #0, #8\n\t" - "UBFX r1, r7, #8, #8\n\t" - "UBFX r4, r7, #16, #8\n\t" - "LSR r7, r7, #24\n\t" - "LDRB r0, [r8, r0, LSL #2]\n\t" - "LDRB r1, [r8, r1, LSL #2]\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r7, [r8, r7, LSL #2]\n\t" - "EOR r3, r7, r0, LSL #8\n\t" - "EOR r3, r3, r1, LSL #16\n\t" - "EOR r3, r3, r4, LSL #24\n\t" - "LDM %[ks]!, {r0, r1, r4, r5, r6, r7}\n\t" - "EOR r0, r0, r3\n\t" + "UBFX r4, r9, #0, #8\n\t" + "UBFX r5, r9, #8, #8\n\t" + "UBFX r6, r9, #16, #8\n\t" + "LSR r9, r9, #24\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB 
r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r9, [r10, r9, LSL #2]\n\t" + "EOR r3, r9, r4, LSL #8\n\t" + "EOR r3, r3, r5, LSL #16\n\t" + "EOR r3, r3, r6, LSL #24\n\t" + "LDM %[ks]!, {r4, r5, r6, r7, r8, r9}\n\t" + "EOR r4, r4, r3\n\t" "LDM lr!, {r3}\n\t" - "EOR r0, r0, r3\n\t" - "EOR r1, r1, r0\n\t" - "EOR r4, r4, r1\n\t" + "EOR r4, r4, r3\n\t" "EOR r5, r5, r4\n\t" - "STM %[ks], {r0, r1, r4, r5}\n\t" - "B L_AES_set_encrypt_key_end_%=\n\t" + "EOR r6, r6, r5\n\t" + "EOR r7, r7, r6\n\t" + "STM %[ks], {r4, r5, r6, r7}\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_set_encrypt_key_end\n\t" +#else + "B.N L_AES_set_encrypt_key_end\n\t" +#endif "\n" - "L_AES_set_encrypt_key_start_128_%=:\n\t" + "L_AES_set_encrypt_key_start_128:\n\t" "LDRD r4, r5, [%[key]]\n\t" "LDRD r6, r7, [%[key], #8]\n\t" "REV r4, r4\n\t" @@ -512,15 +526,15 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "STM %[ks], {r4, r5, r6, r7}\n\t" "MOV r12, #0xa\n\t" "\n" - "L_AES_set_encrypt_key_loop_128_%=:\n\t" + "L_AES_set_encrypt_key_loop_128:\n\t" "UBFX r4, r7, #0, #8\n\t" "UBFX r5, r7, #8, #8\n\t" "UBFX r6, r7, #16, #8\n\t" "LSR r7, r7, #24\n\t" - "LDRB r4, [r8, r4, LSL #2]\n\t" - "LDRB r5, [r8, r5, LSL #2]\n\t" - "LDRB r6, [r8, r6, LSL #2]\n\t" - "LDRB r7, [r8, r7, LSL #2]\n\t" + "LDRB r4, [r10, r4, LSL #2]\n\t" + "LDRB r5, [r10, r5, LSL #2]\n\t" + "LDRB r6, [r10, r6, LSL #2]\n\t" + "LDRB r7, [r10, r7, LSL #2]\n\t" "EOR r3, r7, r4, LSL #8\n\t" "EOR r3, r3, r5, LSL #16\n\t" "EOR r3, r3, r6, LSL #24\n\t" @@ -534,15 +548,21 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len, unsigned char* ks "STM %[ks], {r4, r5, r6, r7}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_set_encrypt_key_loop_128_%=\n\t" + "BNE L_AES_set_encrypt_key_loop_128\n\t" #else - "BNE.N L_AES_set_encrypt_key_loop_128_%=\n\t" + "BNE.N L_AES_set_encrypt_key_loop_128\n\t" #endif "\n" - "L_AES_set_encrypt_key_end_%=:\n\t" - : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_rcon] "+r" (L_AES_Thumb2_rcon_c) + "L_AES_set_encrypt_key_end:\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks), + [L_AES_Thumb2_te] "+r" (L_AES_Thumb2_te_c), [L_AES_Thumb2_rcon] "+r" (L_AES_Thumb2_rcon_c) : - : "memory", "r12", "lr", "r5", "r6", "r7", "r8" +#else + : [key] "+r" (key), [len] "+r" (len), [ks] "+r" (ks) + : [L_AES_Thumb2_te] "r" (L_AES_Thumb2_te), [L_AES_Thumb2_rcon] "r" (L_AES_Thumb2_rcon) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10" ); } @@ -562,7 +582,7 @@ void AES_encrypt_block(const uint32_t* te, int nr, int len, const uint32_t* ks) __asm__ __volatile__ ( "\n" - "L_AES_encrypt_block_nr_%=:\n\t" + "L_AES_encrypt_block_nr:\n\t" "UBFX r8, r5, #16, #8\n\t" "LSR r11, r4, #24\n\t" "UBFX lr, r6, #8, #8\n\t" @@ -664,10 +684,10 @@ void AES_encrypt_block(const uint32_t* te, int nr, int len, const uint32_t* ks) "EOR r6, r6, r10\n\t" "EOR r7, r7, r11\n\t" "SUBS %[nr], %[nr], #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_encrypt_block_nr_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_encrypt_block_nr\n\t" #else - "BNE.N L_AES_encrypt_block_nr_%=\n\t" + "BNE.W L_AES_encrypt_block_nr\n\t" #endif "UBFX r8, r5, #16, #8\n\t" "LSR r11, r4, #24\n\t" @@ -793,28 +813,32 @@ void AES_ECB_encrypt(const 
unsigned char* in, unsigned char* out, unsigned long register unsigned long len __asm__ ("r2") = (unsigned long)len_p; register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_ecb_c __asm__ ("r5") = (uint32_t*)L_AES_Thumb2_te_ecb; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "PUSH {%[ks]}\n\t" "CMP r12, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_ECB_encrypt_start_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_ECB_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_ECB_encrypt_start_block_128_%=\n\t" + "BEQ.W L_AES_ECB_encrypt_start_block_128\n\t" #endif "CMP r12, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_ECB_encrypt_start_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_ECB_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_ECB_encrypt_start_block_192_%=\n\t" + "BEQ.W L_AES_ECB_encrypt_start_block_192\n\t" #endif "\n" - "L_AES_ECB_encrypt_loop_block_256_%=:\n\t" + "L_AES_ECB_encrypt_loop_block_256:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -845,16 +869,20 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_encrypt_loop_block_256_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_encrypt_loop_block_256\n\t" #else - "BNE.N L_AES_ECB_encrypt_loop_block_256_%=\n\t" + "BNE.W L_AES_ECB_encrypt_loop_block_256\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_ECB_encrypt_end\n\t" +#else + "B.N L_AES_ECB_encrypt_end\n\t" #endif - "B L_AES_ECB_encrypt_end_%=\n\t" "\n" - "L_AES_ECB_encrypt_start_block_192_%=:\n\t" + "L_AES_ECB_encrypt_start_block_192:\n\t" "\n" - "L_AES_ECB_encrypt_loop_block_192_%=:\n\t" + "L_AES_ECB_encrypt_loop_block_192:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -885,16 +913,20 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_encrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_encrypt_loop_block_192\n\t" #else - "BNE.N L_AES_ECB_encrypt_loop_block_192_%=\n\t" + "BNE.W L_AES_ECB_encrypt_loop_block_192\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_ECB_encrypt_end\n\t" +#else + "B.N L_AES_ECB_encrypt_end\n\t" #endif - "B L_AES_ECB_encrypt_end_%=\n\t" "\n" - "L_AES_ECB_encrypt_start_block_128_%=:\n\t" + "L_AES_ECB_encrypt_start_block_128:\n\t" "\n" - "L_AES_ECB_encrypt_loop_block_128_%=:\n\t" + "L_AES_ECB_encrypt_loop_block_128:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -925,16 +957,22 @@ void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_encrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_ECB_encrypt_loop_block_128_%=\n\t" + "BNE.W L_AES_ECB_encrypt_loop_block_128\n\t" #endif "\n" - "L_AES_ECB_encrypt_end_%=:\n\t" + "L_AES_ECB_encrypt_end:\n\t" "POP {%[ks]}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r6", "r7", "r8", "r9", "r10", "r11" ); } @@ -956,30 +994,38 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; register unsigned char* iv __asm__ ("r5") = (unsigned char*)iv_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_ecb_c __asm__ ("r6") = (uint32_t*)L_AES_Thumb2_te_ecb; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r9, r5\n\t" +#else + "LDR r9, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" "LDM r9, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r9}\n\t" "CMP r8, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_encrypt_start_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_CBC_encrypt_start_block_128_%=\n\t" + "BEQ.W L_AES_CBC_encrypt_start_block_128\n\t" #endif "CMP r8, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_encrypt_start_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_CBC_encrypt_start_block_192_%=\n\t" + "BEQ.W L_AES_CBC_encrypt_start_block_192\n\t" #endif "\n" - "L_AES_CBC_encrypt_loop_block_256_%=:\n\t" + "L_AES_CBC_encrypt_loop_block_256:\n\t" "LDR r8, [lr]\n\t" "LDR r9, [lr, #4]\n\t" "LDR r10, [lr, #8]\n\t" @@ -1014,16 +1060,20 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_encrypt_loop_block_256_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_encrypt_loop_block_256\n\t" #else - "BNE.N L_AES_CBC_encrypt_loop_block_256_%=\n\t" + "BNE.W L_AES_CBC_encrypt_loop_block_256\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_CBC_encrypt_end\n\t" +#else + "B.N L_AES_CBC_encrypt_end\n\t" #endif - "B L_AES_CBC_encrypt_end_%=\n\t" "\n" - "L_AES_CBC_encrypt_start_block_192_%=:\n\t" + "L_AES_CBC_encrypt_start_block_192:\n\t" "\n" - "L_AES_CBC_encrypt_loop_block_192_%=:\n\t" + "L_AES_CBC_encrypt_loop_block_192:\n\t" "LDR r8, [lr]\n\t" "LDR r9, [lr, #4]\n\t" "LDR r10, [lr, #8]\n\t" @@ -1058,16 +1108,20 @@ void AES_CBC_encrypt(const unsigned 
char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_encrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_encrypt_loop_block_192\n\t" #else - "BNE.N L_AES_CBC_encrypt_loop_block_192_%=\n\t" + "BNE.W L_AES_CBC_encrypt_loop_block_192\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_CBC_encrypt_end\n\t" +#else + "B.N L_AES_CBC_encrypt_end\n\t" #endif - "B L_AES_CBC_encrypt_end_%=\n\t" "\n" - "L_AES_CBC_encrypt_start_block_128_%=:\n\t" + "L_AES_CBC_encrypt_start_block_128:\n\t" "\n" - "L_AES_CBC_encrypt_loop_block_128_%=:\n\t" + "L_AES_CBC_encrypt_loop_block_128:\n\t" "LDR r8, [lr]\n\t" "LDR r9, [lr, #4]\n\t" "LDR r10, [lr, #8]\n\t" @@ -1102,17 +1156,23 @@ void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_encrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_CBC_encrypt_loop_block_128_%=\n\t" + "BNE.W L_AES_CBC_encrypt_loop_block_128\n\t" #endif "\n" - "L_AES_CBC_encrypt_end_%=:\n\t" + "L_AES_CBC_encrypt_end:\n\t" "POP {%[ks], r9}\n\t" "STM r9, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -1134,12 +1194,20 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; register unsigned char* ctr __asm__ ("r5") = (unsigned char*)ctr_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_ecb_c __asm__ ("r6") = (uint32_t*)L_AES_Thumb2_te_ecb; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r5\n\t" +#else + "LDR r8, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_ecb]\n\t" "LDM r8, {r4, r5, r6, r7}\n\t" @@ -1150,19 +1218,19 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "STM r8, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r8}\n\t" "CMP r12, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CTR_encrypt_start_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CTR_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_CTR_encrypt_start_block_128_%=\n\t" + "BEQ.W L_AES_CTR_encrypt_start_block_128\n\t" #endif "CMP r12, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ 
L_AES_CTR_encrypt_start_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CTR_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_CTR_encrypt_start_block_192_%=\n\t" + "BEQ.W L_AES_CTR_encrypt_start_block_192\n\t" #endif "\n" - "L_AES_CTR_encrypt_loop_block_256_%=:\n\t" + "L_AES_CTR_encrypt_loop_block_256:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADDS r11, r7, #0x1\n\t" @@ -1201,16 +1269,20 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CTR_encrypt_loop_block_256_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CTR_encrypt_loop_block_256\n\t" #else - "BNE.N L_AES_CTR_encrypt_loop_block_256_%=\n\t" + "BNE.W L_AES_CTR_encrypt_loop_block_256\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_CTR_encrypt_end\n\t" +#else + "B.W L_AES_CTR_encrypt_end\n\t" #endif - "B L_AES_CTR_encrypt_end_%=\n\t" "\n" - "L_AES_CTR_encrypt_start_block_192_%=:\n\t" + "L_AES_CTR_encrypt_start_block_192:\n\t" "\n" - "L_AES_CTR_encrypt_loop_block_192_%=:\n\t" + "L_AES_CTR_encrypt_loop_block_192:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADDS r11, r7, #0x1\n\t" @@ -1249,16 +1321,20 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CTR_encrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CTR_encrypt_loop_block_192\n\t" #else - "BNE.N L_AES_CTR_encrypt_loop_block_192_%=\n\t" + "BNE.W L_AES_CTR_encrypt_loop_block_192\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_CTR_encrypt_end\n\t" +#else + "B.W L_AES_CTR_encrypt_end\n\t" #endif - "B L_AES_CTR_encrypt_end_%=\n\t" "\n" - "L_AES_CTR_encrypt_start_block_128_%=:\n\t" + "L_AES_CTR_encrypt_start_block_128:\n\t" "\n" - "L_AES_CTR_encrypt_loop_block_128_%=:\n\t" + "L_AES_CTR_encrypt_loop_block_128:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADDS r11, r7, #0x1\n\t" @@ -1297,21 +1373,27 @@ void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CTR_encrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CTR_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_CTR_encrypt_loop_block_128_%=\n\t" + "BNE.W L_AES_CTR_encrypt_loop_block_128\n\t" #endif "\n" - "L_AES_CTR_encrypt_end_%=:\n\t" + "L_AES_CTR_encrypt_end:\n\t" "POP {%[ks], r8}\n\t" "REV r4, r4\n\t" "REV r5, r5\n\t" "REV r6, r6\n\t" "REV r7, r7\n\t" "STM r8, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_Thumb2_te_ecb] "+r" (L_AES_Thumb2_te_ecb_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [L_AES_Thumb2_te_ecb] "r" (L_AES_Thumb2_te_ecb) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -1334,7 +1416,7 @@ void AES_decrypt_block(const uint32_t* td, int nr, const uint8_t* td4) 
__asm__ __volatile__ ( "\n" - "L_AES_decrypt_block_nr_%=:\n\t" + "L_AES_decrypt_block_nr:\n\t" "UBFX r8, r7, #16, #8\n\t" "LSR r11, r4, #24\n\t" "UBFX r12, r6, #8, #8\n\t" @@ -1436,10 +1518,10 @@ void AES_decrypt_block(const uint32_t* td, int nr, const uint8_t* td4) "EOR r6, r6, r10\n\t" "EOR r7, r7, r11\n\t" "SUBS %[nr], %[nr], #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_decrypt_block_nr_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_decrypt_block_nr\n\t" #else - "BNE.N L_AES_decrypt_block_nr_%=\n\t" + "BNE.W L_AES_decrypt_block_nr\n\t" #endif "UBFX r8, r7, #16, #8\n\t" "LSR r11, r4, #24\n\t" @@ -1598,30 +1680,34 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long register unsigned long len __asm__ ("r2") = (unsigned long)len_p; register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_td_ecb_c __asm__ ("r5") = (uint32_t*)L_AES_Thumb2_td_ecb; register unsigned char* L_AES_Thumb2_td4_c __asm__ ("r6") = (unsigned char*)&L_AES_Thumb2_td4; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_td_ecb]\n\t" "MOV r12, %[len]\n\t" "MOV r2, %[L_AES_Thumb2_td4]\n\t" "CMP r8, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_ECB_decrypt_start_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_ECB_decrypt_start_block_128\n\t" #else - "BEQ.N L_AES_ECB_decrypt_start_block_128_%=\n\t" + "BEQ.W L_AES_ECB_decrypt_start_block_128\n\t" #endif "CMP r8, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_ECB_decrypt_start_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_ECB_decrypt_start_block_192\n\t" #else - "BEQ.N L_AES_ECB_decrypt_start_block_192_%=\n\t" + "BEQ.W L_AES_ECB_decrypt_start_block_192\n\t" #endif "\n" - "L_AES_ECB_decrypt_loop_block_256_%=:\n\t" + "L_AES_ECB_decrypt_loop_block_256:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -1651,16 +1737,20 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_decrypt_loop_block_256_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_decrypt_loop_block_256\n\t" #else - "BNE.N L_AES_ECB_decrypt_loop_block_256_%=\n\t" + "BNE.W L_AES_ECB_decrypt_loop_block_256\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_ECB_decrypt_end\n\t" +#else + "B.N L_AES_ECB_decrypt_end\n\t" #endif - "B L_AES_ECB_decrypt_end_%=\n\t" "\n" - "L_AES_ECB_decrypt_start_block_192_%=:\n\t" + "L_AES_ECB_decrypt_start_block_192:\n\t" "\n" - "L_AES_ECB_decrypt_loop_block_192_%=:\n\t" + "L_AES_ECB_decrypt_loop_block_192:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -1690,16 +1780,20 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_decrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BNE 
L_AES_ECB_decrypt_loop_block_192\n\t" #else - "BNE.N L_AES_ECB_decrypt_loop_block_192_%=\n\t" + "BNE.W L_AES_ECB_decrypt_loop_block_192\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_ECB_decrypt_end\n\t" +#else + "B.N L_AES_ECB_decrypt_end\n\t" #endif - "B L_AES_ECB_decrypt_end_%=\n\t" "\n" - "L_AES_ECB_decrypt_start_block_128_%=:\n\t" + "L_AES_ECB_decrypt_start_block_128:\n\t" "\n" - "L_AES_ECB_decrypt_loop_block_128_%=:\n\t" + "L_AES_ECB_decrypt_loop_block_128:\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" "LDR r6, [lr, #8]\n\t" @@ -1729,15 +1823,21 @@ void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_ECB_decrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_ECB_decrypt_loop_block_128\n\t" #else - "BNE.N L_AES_ECB_decrypt_loop_block_128_%=\n\t" + "BNE.W L_AES_ECB_decrypt_loop_block_128\n\t" #endif "\n" - "L_AES_ECB_decrypt_end_%=:\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) + "L_AES_ECB_decrypt_end:\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), + [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr) + : [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb), [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -1759,32 +1859,40 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; register unsigned char* iv __asm__ ("r5") = (unsigned char*)iv_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_td_ecb_c __asm__ ("r6") = (uint32_t*)L_AES_Thumb2_td_ecb; register unsigned char* L_AES_Thumb2_td4_c __asm__ ("r7") = (unsigned char*)&L_AES_Thumb2_td4; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r4\n\t" +#else + "LDR r8, [sp, #36]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r4, r5\n\t" +#else + "LDR r4, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_td_ecb]\n\t" "MOV r12, %[len]\n\t" "MOV r2, %[L_AES_Thumb2_td4]\n\t" "PUSH {%[ks], r4}\n\t" "CMP r8, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_decrypt_loop_block_128\n\t" #else - "BEQ.N L_AES_CBC_decrypt_loop_block_128_%=\n\t" + "BEQ.W L_AES_CBC_decrypt_loop_block_128\n\t" #endif "CMP r8, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_decrypt_loop_block_192\n\t" #else - "BEQ.N L_AES_CBC_decrypt_loop_block_192_%=\n\t" + "BEQ.W L_AES_CBC_decrypt_loop_block_192\n\t" #endif "\n" - "L_AES_CBC_decrypt_loop_block_256_%=:\n\t" + "L_AES_CBC_decrypt_loop_block_256:\n\t" "PUSH {r1, 
r12, lr}\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" @@ -1824,10 +1932,10 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_decrypt_end_odd\n\t" #else - "BEQ.N L_AES_CBC_decrypt_end_odd_%=\n\t" + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" #endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" @@ -1869,14 +1977,18 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_decrypt_loop_block_256_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_decrypt_loop_block_256\n\t" #else - "BNE.N L_AES_CBC_decrypt_loop_block_256_%=\n\t" + "BNE.W L_AES_CBC_decrypt_loop_block_256\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_CBC_decrypt_end\n\t" +#else + "B.W L_AES_CBC_decrypt_end\n\t" #endif - "B L_AES_CBC_decrypt_end_%=\n\t" "\n" - "L_AES_CBC_decrypt_loop_block_192_%=:\n\t" + "L_AES_CBC_decrypt_loop_block_192:\n\t" "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" @@ -1916,10 +2028,10 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_decrypt_end_odd\n\t" #else - "BEQ.N L_AES_CBC_decrypt_end_odd_%=\n\t" + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" #endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" @@ -1961,14 +2073,18 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_decrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_decrypt_loop_block_192\n\t" #else - "BNE.N L_AES_CBC_decrypt_loop_block_192_%=\n\t" + "BNE.W L_AES_CBC_decrypt_loop_block_192\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_CBC_decrypt_end\n\t" +#else + "B.W L_AES_CBC_decrypt_end\n\t" #endif - "B L_AES_CBC_decrypt_end_%=\n\t" "\n" - "L_AES_CBC_decrypt_loop_block_128_%=:\n\t" + "L_AES_CBC_decrypt_loop_block_128:\n\t" "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" "LDR r5, [lr, #4]\n\t" @@ -2008,10 +2124,10 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_CBC_decrypt_end_odd_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_CBC_decrypt_end_odd\n\t" #else - "BEQ.N L_AES_CBC_decrypt_end_odd_%=\n\t" + "BEQ.W L_AES_CBC_decrypt_end_odd\n\t" #endif "PUSH {r1, r12, lr}\n\t" "LDR r4, [lr]\n\t" @@ -2053,24 +2169,34 @@ void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS r12, r12, #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_CBC_decrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_CBC_decrypt_loop_block_128\n\t" #else - "BNE.N L_AES_CBC_decrypt_loop_block_128_%=\n\t" + "BNE.W 
L_AES_CBC_decrypt_loop_block_128\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_AES_CBC_decrypt_end\n\t" +#else + "B.N L_AES_CBC_decrypt_end\n\t" #endif - "B L_AES_CBC_decrypt_end_%=\n\t" "\n" - "L_AES_CBC_decrypt_end_odd_%=:\n\t" + "L_AES_CBC_decrypt_end_odd:\n\t" "LDR r4, [sp, #4]\n\t" "LDRD r8, r9, [r4, #16]\n\t" "LDRD r10, r11, [r4, #24]\n\t" "STRD r8, r9, [r4]\n\t" "STRD r10, r11, [r4, #8]\n\t" "\n" - "L_AES_CBC_decrypt_end_%=:\n\t" + "L_AES_CBC_decrypt_end:\n\t" "POP {%[ks], r4}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv), + [L_AES_Thumb2_td_ecb] "+r" (L_AES_Thumb2_td_ecb_c), [L_AES_Thumb2_td4] "+r" (L_AES_Thumb2_td4_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [iv] "+r" (iv) + : [L_AES_Thumb2_td_ecb] "r" (L_AES_Thumb2_td_ecb), [L_AES_Thumb2_td4] "r" (L_AES_Thumb2_td4) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r8", "r9", "r10", "r11" ); } @@ -2099,13 +2225,13 @@ void GCM_gmult_len(unsigned char* x, const unsigned char** m, const unsigned cha register const unsigned char** m __asm__ ("r1") = (const unsigned char**)m_p; register const unsigned char* data __asm__ ("r2") = (const unsigned char*)data_p; register unsigned long len __asm__ ("r3") = (unsigned long)len_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_GCM_gmult_len_r_c __asm__ ("r4") = (uint32_t*)&L_GCM_gmult_len_r; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( "MOV lr, %[L_GCM_gmult_len_r]\n\t" "\n" - "L_GCM_gmult_len_start_block_%=:\n\t" + "L_GCM_gmult_len_start_block:\n\t" "PUSH {r3}\n\t" "LDR r12, [r0, #12]\n\t" "LDR %[len], [r2, #12]\n\t" @@ -2650,13 +2776,19 @@ void GCM_gmult_len(unsigned char* x, const unsigned char** m, const unsigned cha "POP {r3}\n\t" "SUBS %[len], %[len], #0x10\n\t" "ADD %[data], %[data], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_GCM_gmult_len_start_block_%=\n\t" +#ifdef __GNUC__ + "BNE L_GCM_gmult_len_start_block\n\t" #else - "BNE.N L_GCM_gmult_len_start_block_%=\n\t" + "BNE.W L_GCM_gmult_len_start_block\n\t" #endif - : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len), [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len), + [L_GCM_gmult_len_r] "+r" (L_GCM_gmult_len_r_c) : +#else + : [x] "+r" (x), [m] "+r" (m), [data] "+r" (data), [len] "+r" (len) + : [L_GCM_gmult_len_r] "r" (L_GCM_gmult_len_r) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r5", "r6", "r7", "r8", "r9", "r10", "r11" ); } @@ -2677,12 +2809,20 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long register const unsigned char* ks __asm__ ("r3") = (const unsigned char*)ks_p; register int nr __asm__ ("r4") = (int)nr_p; register unsigned char* ctr __asm__ ("r5") = (unsigned char*)ctr_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_AES_Thumb2_te_gcm_c __asm__ ("r6") = (uint32_t*)L_AES_Thumb2_te_gcm; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r12, r4\n\t" +#else + "LDR r12, [sp, #36]\n\t" +#endif 
/* !WOLFSSL_NO_VAR_ASSIGN_REG */ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG "MOV r8, r5\n\t" +#else + "LDR r8, [sp, #40]\n\t" +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ "MOV lr, %[in]\n\t" "MOV r0, %[L_AES_Thumb2_te_gcm]\n\t" "LDM r8, {r4, r5, r6, r7}\n\t" @@ -2693,19 +2833,19 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "STM r8, {r4, r5, r6, r7}\n\t" "PUSH {%[ks], r8}\n\t" "CMP r12, #0xa\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_GCM_encrypt_start_block_128_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_GCM_encrypt_start_block_128\n\t" #else - "BEQ.N L_AES_GCM_encrypt_start_block_128_%=\n\t" + "BEQ.W L_AES_GCM_encrypt_start_block_128\n\t" #endif "CMP r12, #0xc\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_AES_GCM_encrypt_start_block_192_%=\n\t" +#ifdef __GNUC__ + "BEQ L_AES_GCM_encrypt_start_block_192\n\t" #else - "BEQ.N L_AES_GCM_encrypt_start_block_192_%=\n\t" + "BEQ.W L_AES_GCM_encrypt_start_block_192\n\t" #endif "\n" - "L_AES_GCM_encrypt_loop_block_256_%=:\n\t" + "L_AES_GCM_encrypt_loop_block_256:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADD r7, r7, #0x1\n\t" @@ -2741,16 +2881,20 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_GCM_encrypt_loop_block_256_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_GCM_encrypt_loop_block_256\n\t" #else - "BNE.N L_AES_GCM_encrypt_loop_block_256_%=\n\t" + "BNE.W L_AES_GCM_encrypt_loop_block_256\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_GCM_encrypt_end\n\t" +#else + "B.W L_AES_GCM_encrypt_end\n\t" #endif - "B L_AES_GCM_encrypt_end_%=\n\t" "\n" - "L_AES_GCM_encrypt_start_block_192_%=:\n\t" + "L_AES_GCM_encrypt_start_block_192:\n\t" "\n" - "L_AES_GCM_encrypt_loop_block_192_%=:\n\t" + "L_AES_GCM_encrypt_loop_block_192:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADD r7, r7, #0x1\n\t" @@ -2786,16 +2930,20 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_GCM_encrypt_loop_block_192_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_GCM_encrypt_loop_block_192\n\t" #else - "BNE.N L_AES_GCM_encrypt_loop_block_192_%=\n\t" + "BNE.W L_AES_GCM_encrypt_loop_block_192\n\t" +#endif +#ifdef __GNUC__ + "B L_AES_GCM_encrypt_end\n\t" +#else + "B.W L_AES_GCM_encrypt_end\n\t" #endif - "B L_AES_GCM_encrypt_end_%=\n\t" "\n" - "L_AES_GCM_encrypt_start_block_128_%=:\n\t" + "L_AES_GCM_encrypt_start_block_128:\n\t" "\n" - "L_AES_GCM_encrypt_loop_block_128_%=:\n\t" + "L_AES_GCM_encrypt_loop_block_128:\n\t" "PUSH {r1, %[len], lr}\n\t" "LDR lr, [sp, #16]\n\t" "ADD r7, r7, #0x1\n\t" @@ -2831,21 +2979,27 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long "SUBS %[len], %[len], #0x10\n\t" "ADD lr, lr, #0x10\n\t" "ADD %[out], %[out], #0x10\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_AES_GCM_encrypt_loop_block_128_%=\n\t" +#ifdef __GNUC__ + "BNE L_AES_GCM_encrypt_loop_block_128\n\t" #else - "BNE.N L_AES_GCM_encrypt_loop_block_128_%=\n\t" + "BNE.W L_AES_GCM_encrypt_loop_block_128\n\t" #endif "\n" - "L_AES_GCM_encrypt_end_%=:\n\t" + "L_AES_GCM_encrypt_end:\n\t" "POP 
{%[ks], r8}\n\t" "REV r4, r4\n\t" "REV r5, r5\n\t" "REV r6, r6\n\t" "REV r7, r7\n\t" "STM r8, {r4, r5, r6, r7}\n\t" - : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), [L_AES_Thumb2_te_gcm] "+r" (L_AES_Thumb2_te_gcm_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr), + [L_AES_Thumb2_te_gcm] "+r" (L_AES_Thumb2_te_gcm_c) : +#else + : [in] "+r" (in), [out] "+r" (out), [len] "+r" (len), [ks] "+r" (ks), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [L_AES_Thumb2_te_gcm] "r" (L_AES_Thumb2_te_gcm) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r12", "lr", "r7", "r8", "r9", "r10", "r11" ); } @@ -2854,7 +3008,7 @@ void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long #endif /* !NO_AES */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ -#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* !defined(__aarch64__) && defined(__thumb__) */ #endif /* WOLFSSL_ARMASM */ #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519.S b/wolfcrypt/src/port/arm/thumb2-curve25519.S index c5ca56b18..e6b5dcf5d 100644 --- a/wolfcrypt/src/port/arm/thumb2-curve25519.S +++ b/wolfcrypt/src/port/arm/thumb2-curve25519.S @@ -43,7 +43,7 @@ .type fe_init, %function fe_init: BX lr - # Cycle Count = 4 + /* Cycle Count = 4 */ .size fe_init,.-fe_init .text .align 4 @@ -51,62 +51,62 @@ fe_init: .type fe_add_sub_op, %function fe_add_sub_op: PUSH {lr} - # Add-Sub + /* Add-Sub */ LDRD r4, r5, [r2] LDRD r6, r7, [r3] - # Add + /* Add */ ADDS r8, r4, r6 MOV r12, #0x0 ADCS r9, r5, r7 ADC r12, r12, #0x0 STRD r8, r9, [r0] - # Sub + /* Sub */ SUBS r10, r4, r6 SBCS r11, r5, r7 STRD r10, r11, [r1] LDRD r4, r5, [r2, #8] LDRD r6, r7, [r3, #8] - # Sub + /* Sub */ SBCS r10, r4, r6 MOV lr, #0x0 SBCS r11, r5, r7 ADC lr, lr, #0x0 STRD r10, r11, [r1, #8] - # Add + /* Add */ SUBS r12, r12, #0x1 ADCS r8, r4, r6 ADCS r9, r5, r7 STRD r8, r9, [r0, #8] LDRD r4, r5, [r2, #16] LDRD r6, r7, [r3, #16] - # Add + /* Add */ ADCS r8, r4, r6 MOV r12, #0x0 ADCS r9, r5, r7 ADC r12, r12, #0x0 STRD r8, r9, [r0, #16] - # Sub + /* Sub */ SUBS lr, lr, #0x1 SBCS r10, r4, r6 SBCS r11, r5, r7 STRD r10, r11, [r1, #16] LDRD r4, r5, [r2, #24] LDRD r6, r7, [r3, #24] - # Sub + /* Sub */ SBCS r10, r4, r6 SBC r11, r5, r7 - # Add + /* Add */ SUBS r12, r12, #0x1 ADCS r8, r4, r6 MOV r12, #0x0 ADCS r9, r5, r7 ADC r12, r12, #0x0 - # Multiply -modulus by overflow + /* Multiply -modulus by overflow */ LSL r3, r12, #1 MOV r12, #0x13 ORR r3, r3, r9, LSR #31 MUL r12, r3, r12 - # Add -x*modulus (if overflow) + /* Add -x*modulus (if overflow) */ LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] ADDS r4, r4, r12 @@ -123,7 +123,7 @@ fe_add_sub_op: ADCS r8, r8, #0x0 ADC r9, r9, #0x0 STRD r8, r9, [r0, #24] - # Add -modulus on underflow + /* Add -modulus on underflow */ MOV lr, #0x13 AND lr, lr, r11, ASR #31 LDM r1, {r4, r5, r6, r7, r8, r9} @@ -137,9 +137,9 @@ fe_add_sub_op: SBCS r10, r10, #0x0 SBC r11, r11, #0x0 STM r1, {r4, r5, r6, r7, r8, r9, r10, r11} - # Done Add-Sub + /* Done Add-Sub */ POP {pc} - # Cycle Count = 134 + /* Cycle Count = 134 */ .size fe_add_sub_op,.-fe_add_sub_op .text .align 4 @@ -147,7 +147,7 @@ fe_add_sub_op: .type fe_sub_op, %function fe_sub_op: PUSH {lr} - # Sub + /* Sub */ LDM r2!, {r6, r7, r8, r9, r10, r11, r12, lr} LDM r1!, {r2, r3, r4, r5} SUBS r6, r2, r6 @@ -171,9 +171,9 @@ fe_sub_op: SBCS r12, r12, #0x0 SBC lr, lr, #0x0 STM r0, {r6, r7, r8, r9, r10, 
r11, r12, lr} - # Done Sub + /* Done Sub */ POP {pc} - # Cycle Count = 51 + /* Cycle Count = 51 */ .size fe_sub_op,.-fe_sub_op .text .align 4 @@ -183,7 +183,7 @@ fe_sub: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} BL fe_sub_op POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 24 + /* Cycle Count = 24 */ .size fe_sub,.-fe_sub .text .align 4 @@ -191,7 +191,7 @@ fe_sub: .type fe_add_op, %function fe_add_op: PUSH {lr} - # Add + /* Add */ LDM r2!, {r6, r7, r8, r9, r10, r11, r12, lr} LDM r1!, {r2, r3, r4, r5} ADDS r6, r2, r6 @@ -215,9 +215,9 @@ fe_add_op: ADCS r12, r12, #0x0 ADC lr, lr, #0x0 STM r0, {r6, r7, r8, r9, r10, r11, r12, lr} - # Done Add + /* Done Add */ POP {pc} - # Cycle Count = 51 + /* Cycle Count = 51 */ .size fe_add_op,.-fe_add_op .text .align 4 @@ -227,7 +227,7 @@ fe_add: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} BL fe_add_op POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 24 + /* Cycle Count = 24 */ .size fe_add,.-fe_add #ifdef HAVE_ED25519 .text @@ -254,7 +254,7 @@ fe_frombytes: STR r8, [r0, #24] STR r9, [r0, #28] POP {r4, r5, r6, r7, r8, r9, pc} - # Cycle Count = 49 + /* Cycle Count = 49 */ .size fe_frombytes,.-fe_frombytes .text .align 4 @@ -291,7 +291,7 @@ fe_tobytes: STR r8, [r0, #24] STR r9, [r0, #28] POP {r4, r5, r6, r7, r8, r9, r10, pc} - # Cycle Count = 62 + /* Cycle Count = 62 */ .size fe_tobytes,.-fe_tobytes .text .align 4 @@ -299,7 +299,7 @@ fe_tobytes: .type fe_1, %function fe_1: PUSH {r4, r5, r6, r7, r8, r9, lr} - # Set one + /* Set one */ MOV r2, #0x1 MOV r3, #0x0 MOV r4, #0x0 @@ -310,7 +310,7 @@ fe_1: MOV r9, #0x0 STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} POP {r4, r5, r6, r7, r8, r9, pc} - # Cycle Count = 33 + /* Cycle Count = 33 */ .size fe_1,.-fe_1 .text .align 4 @@ -318,7 +318,7 @@ fe_1: .type fe_0, %function fe_0: PUSH {r4, r5, r6, r7, r8, r9, lr} - # Set zero + /* Set zero */ MOV r2, #0x0 MOV r3, #0x0 MOV r4, #0x0 @@ -329,7 +329,7 @@ fe_0: MOV r9, #0x0 STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} POP {r4, r5, r6, r7, r8, r9, pc} - # Cycle Count = 33 + /* Cycle Count = 33 */ .size fe_0,.-fe_0 .text .align 4 @@ -337,7 +337,7 @@ fe_0: .type fe_copy, %function fe_copy: PUSH {r4, r5, lr} - # Copy + /* Copy */ LDRD r2, r3, [r1] LDRD r4, r5, [r1, #8] STRD r2, r3, [r0] @@ -347,7 +347,7 @@ fe_copy: STRD r2, r3, [r0, #16] STRD r4, r5, [r0, #24] POP {r4, r5, pc} - # Cycle Count = 32 + /* Cycle Count = 32 */ .size fe_copy,.-fe_copy .text .align 4 @@ -371,7 +371,7 @@ fe_neg: SBC r5, r6, r5 STM r0!, {r2, r3, r4, r5} POP {r4, r5, r6, r7, pc} - # Cycle Count = 43 + /* Cycle Count = 43 */ .size fe_neg,.-fe_neg .text .align 4 @@ -407,7 +407,7 @@ fe_isnonzero: ORR r2, r2, r8 ORR r0, r2, r4 POP {r4, r5, r6, r7, r8, r9, r10, pc} - # Cycle Count = 53 + /* Cycle Count = 53 */ .size fe_isnonzero,.-fe_isnonzero .text .align 4 @@ -430,7 +430,7 @@ fe_isnegative: LSR r1, r1, #31 EOR r0, r0, r1 POP {r4, r5, pc} - # Cycle Count = 31 + /* Cycle Count = 31 */ .size fe_isnegative,.-fe_isnegative #if defined(HAVE_ED25519_MAKE_KEY) || defined(HAVE_ED25519_SIGN) #ifndef WC_NO_CACHE_RESISTANT @@ -1404,7 +1404,7 @@ fe_cmov_table: STRD r6, r7, [r0, #56] STRD r8, r9, [r0, #88] POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 1195 + /* Cycle Count = 1195 */ .size fe_cmov_table,.-fe_cmov_table #else .text @@ -1506,7 +1506,7 @@ fe_cmov_table: STM r0!, {r4, r5, r6, r7} SUB r1, r1, r2 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 160 + /* Cycle Count = 160 */ .size fe_cmov_table,.-fe_cmov_table #endif /* WC_NO_CACHE_RESISTANT */ #endif /* 
HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */ @@ -1522,329 +1522,329 @@ fe_mul_op: STR r0, [sp, #36] MOV r0, #0x0 LDR r12, [r1] - # A[0] * B[0] + /* A[0] * B[0] */ LDR lr, [r2] UMULL r3, r4, r12, lr - # A[0] * B[2] + /* A[0] * B[2] */ LDR lr, [r2, #8] UMULL r5, r6, r12, lr - # A[0] * B[4] + /* A[0] * B[4] */ LDR lr, [r2, #16] UMULL r7, r8, r12, lr - # A[0] * B[6] + /* A[0] * B[6] */ LDR lr, [r2, #24] UMULL r9, r10, r12, lr STR r3, [sp] - # A[0] * B[1] + /* A[0] * B[1] */ LDR lr, [r2, #4] MOV r11, r0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[0] * B[3] + /* A[0] * B[3] */ LDR lr, [r2, #12] ADCS r6, r6, #0x0 ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[0] * B[5] + /* A[0] * B[5] */ LDR lr, [r2, #20] ADCS r8, r8, #0x0 ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[0] * B[7] + /* A[0] * B[7] */ LDR lr, [r2, #28] ADCS r10, r10, #0x0 ADC r3, r0, #0x0 UMLAL r10, r3, r12, lr - # A[1] * B[0] + /* A[1] * B[0] */ LDR r12, [r1, #4] LDR lr, [r2] MOV r11, #0x0 UMLAL r4, r11, r12, lr STR r4, [sp, #4] ADDS r5, r5, r11 - # A[1] * B[1] + /* A[1] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[1] * B[2] + /* A[1] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[1] * B[3] + /* A[1] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[1] * B[4] + /* A[1] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[1] * B[5] + /* A[1] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[1] * B[6] + /* A[1] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[1] * B[7] + /* A[1] * B[7] */ LDR lr, [r2, #28] ADC r4, r0, #0x0 UMLAL r3, r4, r12, lr - # A[2] * B[0] + /* A[2] * B[0] */ LDR r12, [r1, #8] LDR lr, [r2] MOV r11, #0x0 UMLAL r5, r11, r12, lr STR r5, [sp, #8] ADDS r6, r6, r11 - # A[2] * B[1] + /* A[2] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[2] * B[2] + /* A[2] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[2] * B[3] + /* A[2] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[2] * B[4] + /* A[2] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[2] * B[5] + /* A[2] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[2] * B[6] + /* A[2] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[2] * B[7] + /* A[2] * B[7] */ LDR lr, [r2, #28] ADC r5, r0, #0x0 UMLAL r4, r5, r12, lr - # A[3] * B[0] + /* A[3] * B[0] */ LDR r12, [r1, #12] LDR lr, [r2] MOV r11, #0x0 UMLAL r6, r11, r12, lr STR r6, [sp, #12] ADDS r7, r7, r11 - # A[3] * B[1] + /* A[3] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[3] * B[2] + /* A[3] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[3] * B[3] + /* A[3] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[3] * B[4] + /* A[3] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[3] * B[5] + /* A[3] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[3] * B[6] + /* A[3] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[3] * 
B[7] + /* A[3] * B[7] */ LDR lr, [r2, #28] ADC r6, r0, #0x0 UMLAL r5, r6, r12, lr - # A[4] * B[0] + /* A[4] * B[0] */ LDR r12, [r1, #16] LDR lr, [r2] MOV r11, #0x0 UMLAL r7, r11, r12, lr STR r7, [sp, #16] ADDS r8, r8, r11 - # A[4] * B[1] + /* A[4] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[4] * B[2] + /* A[4] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[4] * B[3] + /* A[4] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[4] * B[4] + /* A[4] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[4] * B[5] + /* A[4] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[4] * B[6] + /* A[4] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[4] * B[7] + /* A[4] * B[7] */ LDR lr, [r2, #28] ADC r7, r0, #0x0 UMLAL r6, r7, r12, lr - # A[5] * B[0] + /* A[5] * B[0] */ LDR r12, [r1, #20] LDR lr, [r2] MOV r11, #0x0 UMLAL r8, r11, r12, lr STR r8, [sp, #20] ADDS r9, r9, r11 - # A[5] * B[1] + /* A[5] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[5] * B[2] + /* A[5] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[5] * B[3] + /* A[5] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[5] * B[4] + /* A[5] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[5] * B[5] + /* A[5] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[5] * B[6] + /* A[5] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[5] * B[7] + /* A[5] * B[7] */ LDR lr, [r2, #28] ADC r8, r0, #0x0 UMLAL r7, r8, r12, lr - # A[6] * B[0] + /* A[6] * B[0] */ LDR r12, [r1, #24] LDR lr, [r2] MOV r11, #0x0 UMLAL r9, r11, r12, lr STR r9, [sp, #24] ADDS r10, r10, r11 - # A[6] * B[1] + /* A[6] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[6] * B[2] + /* A[6] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[6] * B[3] + /* A[6] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[6] * B[4] + /* A[6] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[6] * B[5] + /* A[6] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[6] * B[6] + /* A[6] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[6] * B[7] + /* A[6] * B[7] */ LDR lr, [r2, #28] ADC r9, r0, #0x0 UMLAL r8, r9, r12, lr - # A[7] * B[0] + /* A[7] * B[0] */ LDR r12, [r1, #28] LDR lr, [r2] MOV r11, #0x0 UMLAL r10, r11, r12, lr STR r10, [sp, #28] ADDS r3, r3, r11 - # A[7] * B[1] + /* A[7] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[7] * B[2] + /* A[7] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[7] * B[3] + /* A[7] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[7] * B[4] + /* A[7] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[7] * B[5] + /* A[7] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[7] * B[6] + /* A[7] * B[6] */ LDR lr, [r2, #24] 
ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[7] * B[7] + /* A[7] * B[7] */ LDR lr, [r2, #28] ADC r10, r0, #0x0 UMLAL r9, r10, r12, lr - # Reduce + /* Reduce */ LDR r2, [sp, #28] MOV lr, sp MOV r12, #0x26 @@ -1883,12 +1883,12 @@ fe_mul_op: UMLAL r7, r11, r9, r12 BFC r10, #31, #1 ADDS r8, r10, r11 - # Store + /* Store */ LDR r0, [sp, #36] STM r0, {r1, r2, r3, r4, r5, r6, r7, r8} ADD sp, sp, #0x28 POP {pc} - # Cycle Count = 406 + /* Cycle Count = 406 */ .size fe_mul_op,.-fe_mul_op #else .text @@ -1995,7 +1995,7 @@ fe_mul_op: UMAAL r8, r10, r2, lr UMAAL r8, r9, r3, r11 UMAAL r9, r10, r3, lr - # Reduce + /* Reduce */ LDR r0, [sp, #28] MOV lr, #0x25 UMAAL r10, r0, r10, lr @@ -2017,11 +2017,11 @@ fe_mul_op: UMAAL r6, r11, r9, lr ADD r7, r10, r11 LDR lr, [sp, #8] - # Store + /* Store */ STM lr, {r0, r1, r2, r3, r4, r5, r6, r7} ADD sp, sp, #0x10 POP {pc} - # Cycle Count = 239 + /* Cycle Count = 239 */ .size fe_mul_op,.-fe_mul_op #endif /* WOLFSSL_SP_NO_UMAAL */ .text @@ -2032,7 +2032,7 @@ fe_mul: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} BL fe_mul_op POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 24 + /* Cycle Count = 24 */ .size fe_mul,.-fe_mul #ifdef WOLFSSL_SP_NO_UMAAL .text @@ -2043,33 +2043,33 @@ fe_sq_op: PUSH {lr} SUB sp, sp, #0x44 STR r0, [sp, #64] - # Square + /* Square */ MOV r0, #0x0 LDR r12, [r1] - # A[0] * A[1] + /* A[0] * A[1] */ LDR lr, [r1, #4] UMULL r4, r5, r12, lr - # A[0] * A[3] + /* A[0] * A[3] */ LDR lr, [r1, #12] UMULL r6, r7, r12, lr - # A[0] * A[5] + /* A[0] * A[5] */ LDR lr, [r1, #20] UMULL r8, r9, r12, lr - # A[0] * A[7] + /* A[0] * A[7] */ LDR lr, [r1, #28] UMULL r10, r3, r12, lr - # A[0] * A[2] + /* A[0] * A[2] */ LDR lr, [r1, #8] MOV r11, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[0] * A[4] + /* A[0] * A[4] */ LDR lr, [r1, #16] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[0] * A[6] + /* A[0] * A[6] */ LDR lr, [r1, #24] ADCS r9, r9, #0x0 ADC r11, r0, #0x0 @@ -2078,112 +2078,112 @@ fe_sq_op: ADCS r3, r3, #0x0 STR r4, [sp, #4] STR r5, [sp, #8] - # A[1] * A[2] + /* A[1] * A[2] */ LDR r12, [r1, #4] LDR lr, [r1, #8] MOV r11, #0x0 UMLAL r6, r11, r12, lr STR r6, [sp, #12] ADDS r7, r7, r11 - # A[1] * A[3] + /* A[1] * A[3] */ LDR lr, [r1, #12] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr STR r7, [sp, #16] ADDS r8, r8, r11 - # A[1] * A[4] + /* A[1] * A[4] */ LDR lr, [r1, #16] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[1] * A[5] + /* A[1] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[1] * A[6] + /* A[1] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[1] * A[7] + /* A[1] * A[7] */ LDR lr, [r1, #28] ADC r4, r0, #0x0 UMLAL r3, r4, r12, lr - # A[2] * A[3] + /* A[2] * A[3] */ LDR r12, [r1, #8] LDR lr, [r1, #12] MOV r11, #0x0 UMLAL r8, r11, r12, lr STR r8, [sp, #20] ADDS r9, r9, r11 - # A[2] * A[4] + /* A[2] * A[4] */ LDR lr, [r1, #16] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr STR r9, [sp, #24] ADDS r10, r10, r11 - # A[2] * A[5] + /* A[2] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[2] * A[6] + /* A[2] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[2] * A[7] + /* A[2] * A[7] */ LDR lr, [r1, #28] ADC r5, r0, #0x0 UMLAL r4, r5, r12, lr - # A[3] * A[4] + /* A[3] * A[4] */ LDR r12, [r1, #12] LDR lr, [r1, #16] MOV r11, #0x0 UMLAL r10, r11, r12, lr STR r10, [sp, #28] ADDS r3, r3, r11 - # A[3] * A[5] + /* A[3] * A[5] 
*/ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[3] * A[6] + /* A[3] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[3] * A[7] + /* A[3] * A[7] */ LDR lr, [r1, #28] ADC r6, r0, #0x0 UMLAL r5, r6, r12, lr - # A[4] * A[5] + /* A[4] * A[5] */ LDR r12, [r1, #16] LDR lr, [r1, #20] MOV r11, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[4] * A[6] + /* A[4] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[4] * A[7] + /* A[4] * A[7] */ LDR lr, [r1, #28] ADC r7, r0, #0x0 UMLAL r6, r7, r12, lr - # A[5] * A[6] + /* A[5] * A[6] */ LDR r12, [r1, #20] LDR lr, [r1, #24] MOV r11, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[5] * A[7] + /* A[5] * A[7] */ LDR lr, [r1, #28] ADC r8, r0, #0x0 UMLAL r7, r8, r12, lr - # A[6] * A[7] + /* A[6] * A[7] */ LDR r12, [r1, #24] LDR lr, [r1, #28] MOV r9, #0x0 @@ -2213,23 +2213,23 @@ fe_sq_op: ADD lr, sp, #0x4 LDM lr, {r4, r5, r6, r7, r8, r9, r10} MOV lr, sp - # A[0] * A[0] + /* A[0] * A[0] */ LDR r12, [r1] UMULL r3, r11, r12, r12 ADDS r4, r4, r11 - # A[1] * A[1] + /* A[1] * A[1] */ LDR r12, [r1, #4] ADCS r5, r5, #0x0 ADC r11, r0, #0x0 UMLAL r5, r11, r12, r12 ADDS r6, r6, r11 - # A[2] * A[2] + /* A[2] * A[2] */ LDR r12, [r1, #8] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, r12 ADDS r8, r8, r11 - # A[3] * A[3] + /* A[3] * A[3] */ LDR r12, [r1, #12] ADCS r9, r9, #0x0 ADC r11, r0, #0x0 @@ -2237,30 +2237,30 @@ fe_sq_op: ADDS r10, r10, r11 STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10} LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10} - # A[4] * A[4] + /* A[4] * A[4] */ LDR r12, [r1, #16] ADCS r3, r3, #0x0 ADC r11, r0, #0x0 UMLAL r3, r11, r12, r12 ADDS r4, r4, r11 - # A[5] * A[5] + /* A[5] * A[5] */ LDR r12, [r1, #20] ADCS r5, r5, #0x0 ADC r11, r0, #0x0 UMLAL r5, r11, r12, r12 ADDS r6, r6, r11 - # A[6] * A[6] + /* A[6] * A[6] */ LDR r12, [r1, #24] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, r12 ADDS r8, r8, r11 - # A[7] * A[7] + /* A[7] * A[7] */ LDR r12, [r1, #28] ADCS r9, r9, #0x0 ADC r10, r10, #0x0 UMLAL r9, r10, r12, r12 - # Reduce + /* Reduce */ LDR r2, [sp, #28] MOV lr, sp MOV r12, #0x26 @@ -2299,12 +2299,12 @@ fe_sq_op: UMLAL r7, r11, r9, r12 BFC r10, #31, #1 ADDS r8, r10, r11 - # Store + /* Store */ LDR r0, [sp, #64] STM r0, {r1, r2, r3, r4, r5, r6, r7, r8} ADD sp, sp, #0x44 POP {pc} - # Cycle Count = 355 + /* Cycle Count = 355 */ .size fe_sq_op,.-fe_sq_op #else .text @@ -2316,7 +2316,7 @@ fe_sq_op: SUB sp, sp, #0x20 STR r0, [sp, #28] LDM r1, {r0, r1, r2, r3, r4, r5, r6, r7} - # Square + /* Square */ UMULL r9, r10, r0, r0 UMULL r11, r12, r0, r1 ADDS r11, r11, r11 @@ -2357,46 +2357,46 @@ fe_sq_op: UMAAL r0, r10, r3, r4 ADCS r0, r0, r0 UMAAL r0, r11, lr, lr - # R[7] = r0 + /* R[7] = r0 */ UMAAL r9, r8, r1, r7 UMAAL r9, r10, r2, r6 UMAAL r12, r9, r3, r5 ADCS r12, r12, r12 UMAAL r12, r11, r4, r4 - # R[8] = r12 + /* R[8] = r12 */ UMAAL r9, r8, r2, r7 UMAAL r10, r9, r3, r6 MOV r2, lr UMAAL r10, r2, r4, r5 ADCS r10, r10, r10 UMAAL r11, r10, lr, lr - # R[9] = r11 + /* R[9] = r11 */ UMAAL r2, r8, r3, r7 UMAAL r2, r9, r4, r6 ADCS r3, r2, r2 UMAAL r10, r3, r5, r5 - # R[10] = r10 + /* R[10] = r10 */ MOV r1, lr UMAAL r1, r8, r4, r7 UMAAL r1, r9, r5, r6 ADCS r4, r1, r1 UMAAL r3, r4, lr, lr - # R[11] = r3 + /* R[11] = r3 */ UMAAL r8, r9, r5, r7 ADCS r8, r8, r8 UMAAL r4, r8, r6, r6 - # R[12] = r4 + /* R[12] = r4 */ MOV r5, lr UMAAL r5, r9, r6, r7 ADCS r5, r5, r5 UMAAL r8, r5, lr, lr - # R[13] = r8 + /* R[13] = r8 */ ADCS r9, r9, r9 UMAAL r9, 
r5, r7, r7 ADCS r7, r5, lr - # R[14] = r9 - # R[15] = r7 - # Reduce + /* R[14] = r9 */ + /* R[15] = r7 */ + /* Reduce */ MOV r6, #0x25 UMAAL r7, r0, r7, r6 MOV r6, #0x13 @@ -2420,10 +2420,10 @@ fe_sq_op: UMAAL r6, lr, r9, r12 ADD r7, r7, lr POP {lr} - # Store + /* Store */ STM lr, {r0, r1, r2, r3, r4, r5, r6, r7} POP {pc} - # Cycle Count = 179 + /* Cycle Count = 179 */ .size fe_sq_op,.-fe_sq_op #endif /* WOLFSSL_SP_NO_UMAAL */ .text @@ -2434,7 +2434,7 @@ fe_sq: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} BL fe_sq_op POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 24 + /* Cycle Count = 24 */ .size fe_sq,.-fe_sq #ifdef HAVE_CURVE25519 #ifdef WOLFSSL_SP_NO_UMAAL @@ -2444,7 +2444,7 @@ fe_sq: .type fe_mul121666, %function fe_mul121666: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} - # Multiply by 121666 + /* Multiply by 121666 */ LDM r1, {r2, r3, r4, r5, r6, r7, r8, r9} MOV r12, #0xdb42 MOVT r12, #0x1 @@ -2485,7 +2485,7 @@ fe_mul121666: ADC r9, r9, #0x0 STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 75 + /* Cycle Count = 75 */ .size fe_mul121666,.-fe_mul121666 #else .text @@ -2494,7 +2494,7 @@ fe_mul121666: .type fe_mul121666, %function fe_mul121666: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} - # Multiply by 121666 + /* Multiply by 121666 */ LDM r1, {r2, r3, r4, r5, r6, r7, r8, r9} MOV r11, #0xdb42 MOVT r11, #0x1 @@ -2522,7 +2522,7 @@ fe_mul121666: ADC r9, r9, #0x0 STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 69 + /* Cycle Count = 69 */ .size fe_mul121666,.-fe_mul121666 #endif /* WOLFSSL_SP_NO_UMAAL */ #ifndef WC_NO_CACHE_RESISTANT @@ -2553,7 +2553,7 @@ curve25519: MOV r3, sp STM r3, {r4, r5, r6, r7, r8, r9, r10, r11} ADD r3, sp, #0x40 - # Copy + /* Copy */ LDM r2, {r4, r5, r6, r7, r8, r9, r10, r11} STM r3, {r4, r5, r6, r7, r8, r9, r10, r11} MOV r1, #0x1e @@ -2572,7 +2572,7 @@ L_curve25519_bits: EOR r1, r1, r2 STR r1, [sp, #172] LDR r0, [sp, #160] - # Conditional Swap + /* Conditional Swap */ RSB r1, r1, #0x0 MOV r3, r0 ADD r12, sp, #0x40 @@ -2625,7 +2625,7 @@ L_curve25519_bits: STM r3!, {r4, r5} STM r12!, {r6, r7} LDR r1, [sp, #172] - # Conditional Swap + /* Conditional Swap */ RSB r1, r1, #0x0 MOV r3, sp ADD r12, sp, #0x20 @@ -2741,21 +2741,21 @@ L_curve25519_bits: LDR r1, [sp, #180] SUBS r1, r1, #0x1 STR r1, [sp, #180] -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BGE L_curve25519_bits #else - BGE.N L_curve25519_bits + BGE.W L_curve25519_bits #endif MOV r1, #0x1f STR r1, [sp, #180] SUBS r2, r2, #0x4 STR r2, [sp, #176] -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BGE L_curve25519_words #else - BGE.N L_curve25519_words + BGE.W L_curve25519_words #endif - # Invert + /* Invert */ ADD r1, sp, #0x0 ADD r0, sp, #0x20 BL fe_sq_op @@ -2938,7 +2938,7 @@ L_curve25519_inv_8: MOV r0, #0x0 ADD sp, sp, #0xbc POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 682 + /* Cycle Count = 682 */ .size curve25519,.-curve25519 #else .text @@ -2973,7 +2973,7 @@ curve25519: MOV r3, sp STM r3, {r4, r5, r6, r7, r8, r9, r10, r11} ADD r3, sp, #0x40 - # Copy + /* Copy */ LDM r2, {r4, r5, r6, r7, r8, r9, r10, r11} STM r3, {r4, r5, r6, r7, r8, r9, r10, r11} MOV r2, #0xfe @@ -2989,7 +2989,7 @@ L_curve25519_bits: EOR r1, r1, r2 ASR r1, r1, #31 STR r2, [sp, #164] - # Conditional Swap + /* Conditional Swap */ ADD r11, sp, #0xb0 LDM r11, {r4, r5, r6, r7} EOR r8, r4, r5 @@ -3001,7 +3001,7 @@ 
L_curve25519_bits: EOR r6, r6, r9 EOR r7, r7, r9 STM r11, {r4, r5, r6, r7} - # Ladder step + /* Ladder step */ LDR r3, [sp, #184] LDR r2, [sp, #176] ADD r1, sp, #0x80 @@ -3067,12 +3067,12 @@ L_curve25519_bits: #else BGE.N L_curve25519_bits #endif - # Cycle Count: 171 + /* Cycle Count: 171 */ LDR r1, [sp, #184] - # Copy + /* Copy */ LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} STM sp, {r4, r5, r6, r7, r8, r9, r10, r11} - # Invert + /* Invert */ ADD r1, sp, #0x0 ADD r0, sp, #0x20 BL fe_sq_op @@ -3252,7 +3252,7 @@ L_curve25519_inv_8: LDR r1, [sp, #176] LDR r0, [sp, #176] BL fe_mul_op - # Ensure result is less than modulus + /* Ensure result is less than modulus */ LDR r0, [sp, #176] LDM r0, {r4, r5, r6, r7, r8, r9, r10, r11} MOV r2, #0x13 @@ -3270,7 +3270,7 @@ L_curve25519_inv_8: MOV r0, #0x0 ADD sp, sp, #0xc0 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 589 + /* Cycle Count = 589 */ .size curve25519,.-curve25519 #endif /* WC_NO_CACHE_RESISTANT */ #endif /* HAVE_CURVE25519 */ @@ -3282,7 +3282,7 @@ L_curve25519_inv_8: fe_invert: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0x88 - # Invert + /* Invert */ STR r0, [sp, #128] STR r1, [sp, #132] LDR r1, [sp, #132] @@ -3464,7 +3464,7 @@ L_fe_invert8: LDR r0, [sp, #128] ADD sp, sp, #0x88 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 292 + /* Cycle Count = 292 */ .size fe_invert,.-fe_invert #ifdef WOLFSSL_SP_NO_UMAAL .text @@ -3475,33 +3475,33 @@ fe_sq2: PUSH {lr} SUB sp, sp, #0x44 STR r0, [sp, #64] - # Square * 2 + /* Square * 2 */ MOV r0, #0x0 LDR r12, [r1] - # A[0] * A[1] + /* A[0] * A[1] */ LDR lr, [r1, #4] UMULL r4, r5, r12, lr - # A[0] * A[3] + /* A[0] * A[3] */ LDR lr, [r1, #12] UMULL r6, r7, r12, lr - # A[0] * A[5] + /* A[0] * A[5] */ LDR lr, [r1, #20] UMULL r8, r9, r12, lr - # A[0] * A[7] + /* A[0] * A[7] */ LDR lr, [r1, #28] UMULL r10, r3, r12, lr - # A[0] * A[2] + /* A[0] * A[2] */ LDR lr, [r1, #8] MOV r11, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[0] * A[4] + /* A[0] * A[4] */ LDR lr, [r1, #16] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[0] * A[6] + /* A[0] * A[6] */ LDR lr, [r1, #24] ADCS r9, r9, #0x0 ADC r11, r0, #0x0 @@ -3510,112 +3510,112 @@ fe_sq2: ADCS r3, r3, #0x0 STR r4, [sp, #4] STR r5, [sp, #8] - # A[1] * A[2] + /* A[1] * A[2] */ LDR r12, [r1, #4] LDR lr, [r1, #8] MOV r11, #0x0 UMLAL r6, r11, r12, lr STR r6, [sp, #12] ADDS r7, r7, r11 - # A[1] * A[3] + /* A[1] * A[3] */ LDR lr, [r1, #12] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr STR r7, [sp, #16] ADDS r8, r8, r11 - # A[1] * A[4] + /* A[1] * A[4] */ LDR lr, [r1, #16] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[1] * A[5] + /* A[1] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[1] * A[6] + /* A[1] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[1] * A[7] + /* A[1] * A[7] */ LDR lr, [r1, #28] ADC r4, r0, #0x0 UMLAL r3, r4, r12, lr - # A[2] * A[3] + /* A[2] * A[3] */ LDR r12, [r1, #8] LDR lr, [r1, #12] MOV r11, #0x0 UMLAL r8, r11, r12, lr STR r8, [sp, #20] ADDS r9, r9, r11 - # A[2] * A[4] + /* A[2] * A[4] */ LDR lr, [r1, #16] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr STR r9, [sp, #24] ADDS r10, r10, r11 - # A[2] * A[5] + /* A[2] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[2] * A[6] + /* A[2] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[2] * A[7] + /* A[2] * A[7] */ LDR lr, [r1, #28] ADC r5, r0, 
#0x0 UMLAL r4, r5, r12, lr - # A[3] * A[4] + /* A[3] * A[4] */ LDR r12, [r1, #12] LDR lr, [r1, #16] MOV r11, #0x0 UMLAL r10, r11, r12, lr STR r10, [sp, #28] ADDS r3, r3, r11 - # A[3] * A[5] + /* A[3] * A[5] */ LDR lr, [r1, #20] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[3] * A[6] + /* A[3] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[3] * A[7] + /* A[3] * A[7] */ LDR lr, [r1, #28] ADC r6, r0, #0x0 UMLAL r5, r6, r12, lr - # A[4] * A[5] + /* A[4] * A[5] */ LDR r12, [r1, #16] LDR lr, [r1, #20] MOV r11, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[4] * A[6] + /* A[4] * A[6] */ LDR lr, [r1, #24] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[4] * A[7] + /* A[4] * A[7] */ LDR lr, [r1, #28] ADC r7, r0, #0x0 UMLAL r6, r7, r12, lr - # A[5] * A[6] + /* A[5] * A[6] */ LDR r12, [r1, #20] LDR lr, [r1, #24] MOV r11, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[5] * A[7] + /* A[5] * A[7] */ LDR lr, [r1, #28] ADC r8, r0, #0x0 UMLAL r7, r8, r12, lr - # A[6] * A[7] + /* A[6] * A[7] */ LDR r12, [r1, #24] LDR lr, [r1, #28] MOV r9, #0x0 @@ -3645,23 +3645,23 @@ fe_sq2: ADD lr, sp, #0x4 LDM lr, {r4, r5, r6, r7, r8, r9, r10} MOV lr, sp - # A[0] * A[0] + /* A[0] * A[0] */ LDR r12, [r1] UMULL r3, r11, r12, r12 ADDS r4, r4, r11 - # A[1] * A[1] + /* A[1] * A[1] */ LDR r12, [r1, #4] ADCS r5, r5, #0x0 ADC r11, r0, #0x0 UMLAL r5, r11, r12, r12 ADDS r6, r6, r11 - # A[2] * A[2] + /* A[2] * A[2] */ LDR r12, [r1, #8] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, r12 ADDS r8, r8, r11 - # A[3] * A[3] + /* A[3] * A[3] */ LDR r12, [r1, #12] ADCS r9, r9, #0x0 ADC r11, r0, #0x0 @@ -3669,30 +3669,30 @@ fe_sq2: ADDS r10, r10, r11 STM lr!, {r3, r4, r5, r6, r7, r8, r9, r10} LDM lr, {r3, r4, r5, r6, r7, r8, r9, r10} - # A[4] * A[4] + /* A[4] * A[4] */ LDR r12, [r1, #16] ADCS r3, r3, #0x0 ADC r11, r0, #0x0 UMLAL r3, r11, r12, r12 ADDS r4, r4, r11 - # A[5] * A[5] + /* A[5] * A[5] */ LDR r12, [r1, #20] ADCS r5, r5, #0x0 ADC r11, r0, #0x0 UMLAL r5, r11, r12, r12 ADDS r6, r6, r11 - # A[6] * A[6] + /* A[6] * A[6] */ LDR r12, [r1, #24] ADCS r7, r7, #0x0 ADC r11, r0, #0x0 UMLAL r7, r11, r12, r12 ADDS r8, r8, r11 - # A[7] * A[7] + /* A[7] * A[7] */ LDR r12, [r1, #28] ADCS r9, r9, #0x0 ADC r10, r10, #0x0 UMLAL r9, r10, r12, r12 - # Reduce + /* Reduce */ LDR r2, [sp, #28] MOV lr, sp MOV r12, #0x26 @@ -3731,7 +3731,7 @@ fe_sq2: UMLAL r7, r11, r9, r12 BFC r10, #31, #1 ADDS r8, r10, r11 - # Reduce if top bit set + /* Reduce if top bit set */ MOV r12, #0x13 AND r11, r12, r8, ASR #31 ADDS r1, r1, r11 @@ -3743,7 +3743,7 @@ fe_sq2: BFC r8, #31, #1 ADCS r7, r7, #0x0 ADC r8, r8, #0x0 - # Double + /* Double */ ADDS r1, r1, r1 ADCS r2, r2, r2 ADCS r3, r3, r3 @@ -3752,7 +3752,7 @@ fe_sq2: ADCS r6, r6, r6 ADCS r7, r7, r7 ADC r8, r8, r8 - # Reduce if top bit set + /* Reduce if top bit set */ MOV r12, #0x13 AND r11, r12, r8, ASR #31 ADDS r1, r1, r11 @@ -3764,12 +3764,12 @@ fe_sq2: BFC r8, #31, #1 ADCS r7, r7, #0x0 ADC r8, r8, #0x0 - # Store + /* Store */ LDR r0, [sp, #64] STM r0, {r1, r2, r3, r4, r5, r6, r7, r8} ADD sp, sp, #0x44 POP {pc} - # Cycle Count = 385 + /* Cycle Count = 385 */ .size fe_sq2,.-fe_sq2 #else .text @@ -3781,7 +3781,7 @@ fe_sq2: SUB sp, sp, #0x24 STRD r0, r1, [sp, #28] LDM r1, {r0, r1, r2, r3, r4, r5, r6, r7} - # Square * 2 + /* Square * 2 */ UMULL r9, r10, r0, r0 UMULL r11, r12, r0, r1 ADDS r11, r11, r11 @@ -3822,46 +3822,46 @@ fe_sq2: UMAAL r0, r10, r3, r4 ADCS r0, r0, r0 UMAAL r0, r11, lr, lr - # R[7] = r0 + /* R[7] = r0 */ UMAAL 
r9, r8, r1, r7 UMAAL r9, r10, r2, r6 UMAAL r12, r9, r3, r5 ADCS r12, r12, r12 UMAAL r12, r11, r4, r4 - # R[8] = r12 + /* R[8] = r12 */ UMAAL r9, r8, r2, r7 UMAAL r10, r9, r3, r6 MOV r2, lr UMAAL r10, r2, r4, r5 ADCS r10, r10, r10 UMAAL r11, r10, lr, lr - # R[9] = r11 + /* R[9] = r11 */ UMAAL r2, r8, r3, r7 UMAAL r2, r9, r4, r6 ADCS r3, r2, r2 UMAAL r10, r3, r5, r5 - # R[10] = r10 + /* R[10] = r10 */ MOV r1, lr UMAAL r1, r8, r4, r7 UMAAL r1, r9, r5, r6 ADCS r4, r1, r1 UMAAL r3, r4, lr, lr - # R[11] = r3 + /* R[11] = r3 */ UMAAL r8, r9, r5, r7 ADCS r8, r8, r8 UMAAL r4, r8, r6, r6 - # R[12] = r4 + /* R[12] = r4 */ MOV r5, lr UMAAL r5, r9, r6, r7 ADCS r5, r5, r5 UMAAL r8, r5, lr, lr - # R[13] = r8 + /* R[13] = r8 */ ADCS r9, r9, r9 UMAAL r9, r5, r7, r7 ADCS r7, r5, lr - # R[14] = r9 - # R[15] = r7 - # Reduce + /* R[14] = r9 */ + /* R[15] = r7 */ + /* Reduce */ MOV r6, #0x25 UMAAL r7, r0, r7, r6 MOV r6, #0x13 @@ -3884,7 +3884,7 @@ fe_sq2: BFC r7, #31, #1 UMAAL r6, lr, r9, r12 ADD r7, r7, lr - # Reduce if top bit set + /* Reduce if top bit set */ MOV r11, #0x13 AND r12, r11, r7, ASR #31 ADDS r0, r0, r12 @@ -3896,7 +3896,7 @@ fe_sq2: BFC r7, #31, #1 ADCS r6, r6, #0x0 ADC r7, r7, #0x0 - # Double + /* Double */ ADDS r0, r0, r0 ADCS r1, r1, r1 ADCS r2, r2, r2 @@ -3905,7 +3905,7 @@ fe_sq2: ADCS r5, r5, r5 ADCS r6, r6, r6 ADC r7, r7, r7 - # Reduce if top bit set + /* Reduce if top bit set */ MOV r11, #0x13 AND r12, r11, r7, ASR #31 ADDS r0, r0, r12 @@ -3918,12 +3918,12 @@ fe_sq2: ADCS r6, r6, #0x0 ADC r7, r7, #0x0 POP {r12, lr} - # Store + /* Store */ STM r12, {r0, r1, r2, r3, r4, r5, r6, r7} MOV r0, r12 MOV r1, lr POP {pc} - # Cycle Count = 213 + /* Cycle Count = 213 */ .size fe_sq2,.-fe_sq2 #endif /* WOLFSSL_SP_NO_UMAAL */ .text @@ -3933,7 +3933,7 @@ fe_sq2: fe_pow22523: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0x68 - # pow22523 + /* pow22523 */ STR r0, [sp, #96] STR r1, [sp, #100] LDR r1, [sp, #100] @@ -4115,7 +4115,7 @@ L_fe_pow22523_8: LDR r0, [sp, #96] ADD sp, sp, #0x68 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 293 + /* Cycle Count = 293 */ .size fe_pow22523,.-fe_pow22523 .text .align 4 @@ -4142,7 +4142,7 @@ ge_p1p1_to_p2: BL fe_mul_op ADD sp, sp, #0x8 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 53 + /* Cycle Count = 53 */ .size ge_p1p1_to_p2,.-ge_p1p1_to_p2 .text .align 4 @@ -4174,7 +4174,7 @@ ge_p1p1_to_p3: BL fe_mul_op ADD sp, sp, #0x8 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 63 + /* Cycle Count = 63 */ .size ge_p1p1_to_p3,.-ge_p1p1_to_p3 .text .align 4 @@ -4218,7 +4218,7 @@ ge_p2_dbl: BL fe_sub_op ADD sp, sp, #0x8 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 87 + /* Cycle Count = 87 */ .size ge_p2_dbl,.-ge_p2_dbl .text .align 4 @@ -4264,7 +4264,7 @@ ge_madd: LDR r1, [sp, #4] ADD r1, r1, #0x40 ADD r0, r0, #0x20 - # Double + /* Double */ LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} ADDS r4, r4, r4 ADCS r5, r5, r5 @@ -4290,13 +4290,13 @@ ge_madd: ADCS r10, r10, #0x0 ADC r11, r11, #0x0 STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} - # Done Double + /* Done Double */ ADD r3, r0, #0x20 ADD r1, r0, #0x20 BL fe_add_sub_op ADD sp, sp, #0xc POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 136 + /* Cycle Count = 136 */ .size ge_madd,.-ge_madd .text .align 4 @@ -4342,7 +4342,7 @@ ge_msub: LDR r1, [sp, #4] ADD r1, r1, #0x40 ADD r0, r0, #0x20 - # Double + /* Double */ LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} ADDS r4, r4, r4 ADCS r5, r5, r5 @@ -4368,14 +4368,14 @@ ge_msub: ADCS r10, r10, #0x0 ADC r11, r11, #0x0 
STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} - # Done Double + /* Done Double */ ADD r3, r0, #0x20 MOV r1, r0 ADD r0, r0, #0x20 BL fe_add_sub_op ADD sp, sp, #0xc POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 137 + /* Cycle Count = 137 */ .size ge_msub,.-ge_msub .text .align 4 @@ -4416,7 +4416,7 @@ ge_add: BL fe_mul_op LDR r1, [sp] ADD r0, sp, #0xc - # Double + /* Double */ LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} ADDS r4, r4, r4 ADCS r5, r5, r5 @@ -4442,7 +4442,7 @@ ge_add: ADCS r10, r10, #0x0 ADC r11, r11, #0x0 STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} - # Done Double + /* Done Double */ ADD r3, r1, #0x20 ADD r2, r1, #0x40 ADD r0, r1, #0x20 @@ -4454,7 +4454,7 @@ ge_add: BL fe_add_sub_op ADD sp, sp, #0x2c POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 138 + /* Cycle Count = 138 */ .size ge_add,.-ge_add .text .align 4 @@ -4495,7 +4495,7 @@ ge_sub: BL fe_mul_op LDR r1, [sp] ADD r0, sp, #0xc - # Double + /* Double */ LDM r1, {r4, r5, r6, r7, r8, r9, r10, r11} ADDS r4, r4, r4 ADCS r5, r5, r5 @@ -4521,7 +4521,7 @@ ge_sub: ADCS r10, r10, #0x0 ADC r11, r11, #0x0 STM r0, {r4, r5, r6, r7, r8, r9, r10, r11} - # Done Double + /* Done Double */ ADD r3, r1, #0x20 ADD r2, r1, #0x40 ADD r0, r1, #0x20 @@ -4533,7 +4533,7 @@ ge_sub: BL fe_add_sub_op ADD sp, sp, #0x2c POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 138 + /* Cycle Count = 138 */ .size ge_sub,.-ge_sub #ifdef WOLFSSL_SP_NO_UMAAL .text @@ -4544,7 +4544,7 @@ sc_reduce: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0x38 STR r0, [sp, #52] - # Load bits 252-511 + /* Load bits 252-511 */ ADD r0, r0, #0x1c LDM r0, {r1, r2, r3, r4, r5, r6, r7, r8, r9} LSR lr, r9, #24 @@ -4566,7 +4566,7 @@ sc_reduce: ORR r2, r2, r1, LSR #28 BFC r9, #28, #4 SUB r0, r0, #0x1c - # Add order times bits 504..511 + /* Add order times bits 504..511 */ MOV r10, #0x2c13 MOVT r10, #0xa30a MOV r11, #0x9ce5 @@ -4597,7 +4597,7 @@ sc_reduce: SBCS r7, r7, #0x0 SBCS r8, r8, #0x0 SBC r9, r9, #0x0 - # Sub product of top 8 words and order + /* Sub product of top 8 words and order */ MOV r12, sp MOV r1, #0x2c13 MOVT r1, #0xa30a @@ -4765,7 +4765,7 @@ sc_reduce: UMLAL r11, lr, r9, r1 STM r12!, {r10, r11, lr} SUB r12, r12, #0x20 - # Subtract at 4 * 32 + /* Subtract at 4 * 32 */ LDM r12, {r10, r11} SUBS r10, r10, r2 SBCS r11, r11, r3 @@ -4784,7 +4784,7 @@ sc_reduce: STM r12!, {r10, r11} SUB r12, r12, #0x24 ASR lr, r11, #25 - # Conditionally subtract order starting at bit 125 + /* Conditionally subtract order starting at bit 125 */ MOV r1, #0xa0000000 MOV r2, #0xba7d MOVT r2, #0x4b9e @@ -4822,7 +4822,7 @@ sc_reduce: STM r12!, {r10} SUB r0, r0, #0x10 MOV r12, sp - # Load bits 252-376 + /* Load bits 252-376 */ ADD r12, r12, #0x1c LDM r12, {r1, r2, r3, r4, r5} LSL r5, r5, #4 @@ -4835,9 +4835,9 @@ sc_reduce: ORR r2, r2, r1, LSR #28 BFC r5, #29, #3 SUB r12, r12, #0x1c - # Sub product of top 4 words and order + /* Sub product of top 4 words and order */ MOV r0, sp - # * -5cf5d3ed + /* * -5cf5d3ed */ MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 @@ -4857,7 +4857,7 @@ sc_reduce: UMLAL r9, lr, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -5812631b + /* * -5812631b */ MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV r10, #0x0 @@ -4877,7 +4877,7 @@ sc_reduce: UMLAL r9, r10, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -a2f79cd7 + /* * -a2f79cd7 */ MOV r1, #0x6329 MOVT r1, #0x5d08 MOV r11, #0x0 @@ -4897,7 +4897,7 @@ sc_reduce: UMLAL r9, r11, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -14def9df + /* * -14def9df */ MOV r1, #0x621 MOVT 
r1, #0xeb21 MOV r12, #0x0 @@ -4917,14 +4917,14 @@ sc_reduce: UMLAL r9, r12, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # Add overflows at 4 * 32 + /* Add overflows at 4 * 32 */ LDM r0, {r6, r7, r8, r9} BFC r9, #28, #4 ADDS r6, r6, lr ADCS r7, r7, r10 ADCS r8, r8, r11 ADC r9, r9, r12 - # Subtract top at 4 * 32 + /* Subtract top at 4 * 32 */ SUBS r6, r6, r2 SBCS r7, r7, r3 SBCS r8, r8, r4 @@ -4954,12 +4954,12 @@ sc_reduce: ADCS r8, r8, #0x0 ADC r9, r9, r1 BFC r9, #28, #4 - # Store result + /* Store result */ LDR r0, [sp, #52] STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} ADD sp, sp, #0x38 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 588 + /* Cycle Count = 588 */ .size sc_reduce,.-sc_reduce #else .text @@ -4970,7 +4970,7 @@ sc_reduce: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0x38 STR r0, [sp, #52] - # Load bits 252-511 + /* Load bits 252-511 */ ADD r0, r0, #0x1c LDM r0, {r1, r2, r3, r4, r5, r6, r7, r8, r9} LSR lr, r9, #24 @@ -4992,7 +4992,7 @@ sc_reduce: ORR r2, r2, r1, LSR #28 BFC r9, #28, #4 SUB r0, r0, #0x1c - # Add order times bits 504..511 + /* Add order times bits 504..511 */ MOV r10, #0x2c13 MOVT r10, #0xa30a MOV r11, #0x9ce5 @@ -5014,7 +5014,7 @@ sc_reduce: SBCS r7, r7, #0x0 SBCS r8, r8, #0x0 SBC r9, r9, #0x0 - # Sub product of top 8 words and order + /* Sub product of top 8 words and order */ MOV r12, sp MOV r1, #0x2c13 MOVT r1, #0xa30a @@ -5098,7 +5098,7 @@ sc_reduce: UMAAL r11, lr, r9, r1 STM r12!, {r10, r11, lr} SUB r12, r12, #0x20 - # Subtract at 4 * 32 + /* Subtract at 4 * 32 */ LDM r12, {r10, r11} SUBS r10, r10, r2 SBCS r11, r11, r3 @@ -5117,7 +5117,7 @@ sc_reduce: STM r12!, {r10, r11} SUB r12, r12, #0x24 ASR lr, r11, #25 - # Conditionally subtract order starting at bit 125 + /* Conditionally subtract order starting at bit 125 */ MOV r1, #0xa0000000 MOV r2, #0xba7d MOVT r2, #0x4b9e @@ -5155,7 +5155,7 @@ sc_reduce: STM r12!, {r10} SUB r0, r0, #0x10 MOV r12, sp - # Load bits 252-376 + /* Load bits 252-376 */ ADD r12, r12, #0x1c LDM r12, {r1, r2, r3, r4, r5} LSL r5, r5, #4 @@ -5168,9 +5168,9 @@ sc_reduce: ORR r2, r2, r1, LSR #28 BFC r5, #29, #3 SUB r12, r12, #0x1c - # Sub product of top 4 words and order + /* Sub product of top 4 words and order */ MOV r0, sp - # * -5cf5d3ed + /* * -5cf5d3ed */ MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 @@ -5181,7 +5181,7 @@ sc_reduce: UMAAL r9, lr, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -5812631b + /* * -5812631b */ MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV r10, #0x0 @@ -5192,7 +5192,7 @@ sc_reduce: UMAAL r9, r10, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -a2f79cd7 + /* * -a2f79cd7 */ MOV r1, #0x6329 MOVT r1, #0x5d08 MOV r11, #0x0 @@ -5203,7 +5203,7 @@ sc_reduce: UMAAL r9, r11, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -14def9df + /* * -14def9df */ MOV r1, #0x621 MOVT r1, #0xeb21 MOV r12, #0x0 @@ -5214,14 +5214,14 @@ sc_reduce: UMAAL r9, r12, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # Add overflows at 4 * 32 + /* Add overflows at 4 * 32 */ LDM r0, {r6, r7, r8, r9} BFC r9, #28, #4 ADDS r6, r6, lr ADCS r7, r7, r10 ADCS r8, r8, r11 ADC r9, r9, r12 - # Subtract top at 4 * 32 + /* Subtract top at 4 * 32 */ SUBS r6, r6, r2 SBCS r7, r7, r3 SBCS r8, r8, r4 @@ -5251,12 +5251,12 @@ sc_reduce: ADCS r8, r8, #0x0 ADC r9, r9, r1 BFC r9, #28, #4 - # Store result + /* Store result */ LDR r0, [sp, #52] STM r0, {r2, r3, r4, r5, r6, r7, r8, r9} ADD sp, sp, #0x38 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 502 + /* Cycle Count = 502 */ .size sc_reduce,.-sc_reduce 
#endif /* WOLFSSL_SP_NO_UMAAL */ #ifdef HAVE_ED25519_SIGN @@ -5272,332 +5272,332 @@ sc_muladd: STM lr, {r0, r1, r3} MOV r0, #0x0 LDR r12, [r1] - # A[0] * B[0] + /* A[0] * B[0] */ LDR lr, [r2] UMULL r3, r4, r12, lr - # A[0] * B[2] + /* A[0] * B[2] */ LDR lr, [r2, #8] UMULL r5, r6, r12, lr - # A[0] * B[4] + /* A[0] * B[4] */ LDR lr, [r2, #16] UMULL r7, r8, r12, lr - # A[0] * B[6] + /* A[0] * B[6] */ LDR lr, [r2, #24] UMULL r9, r10, r12, lr STR r3, [sp] - # A[0] * B[1] + /* A[0] * B[1] */ LDR lr, [r2, #4] MOV r11, r0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[0] * B[3] + /* A[0] * B[3] */ LDR lr, [r2, #12] ADCS r6, r6, #0x0 ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[0] * B[5] + /* A[0] * B[5] */ LDR lr, [r2, #20] ADCS r8, r8, #0x0 ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[0] * B[7] + /* A[0] * B[7] */ LDR lr, [r2, #28] ADCS r10, r10, #0x0 ADC r3, r0, #0x0 UMLAL r10, r3, r12, lr - # A[1] * B[0] + /* A[1] * B[0] */ LDR r12, [r1, #4] LDR lr, [r2] MOV r11, #0x0 UMLAL r4, r11, r12, lr STR r4, [sp, #4] ADDS r5, r5, r11 - # A[1] * B[1] + /* A[1] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[1] * B[2] + /* A[1] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[1] * B[3] + /* A[1] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[1] * B[4] + /* A[1] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[1] * B[5] + /* A[1] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[1] * B[6] + /* A[1] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[1] * B[7] + /* A[1] * B[7] */ LDR lr, [r2, #28] ADC r4, r0, #0x0 UMLAL r3, r4, r12, lr - # A[2] * B[0] + /* A[2] * B[0] */ LDR r12, [r1, #8] LDR lr, [r2] MOV r11, #0x0 UMLAL r5, r11, r12, lr STR r5, [sp, #8] ADDS r6, r6, r11 - # A[2] * B[1] + /* A[2] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[2] * B[2] + /* A[2] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[2] * B[3] + /* A[2] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[2] * B[4] + /* A[2] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[2] * B[5] + /* A[2] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[2] * B[6] + /* A[2] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[2] * B[7] + /* A[2] * B[7] */ LDR lr, [r2, #28] ADC r5, r0, #0x0 UMLAL r4, r5, r12, lr - # A[3] * B[0] + /* A[3] * B[0] */ LDR r12, [r1, #12] LDR lr, [r2] MOV r11, #0x0 UMLAL r6, r11, r12, lr STR r6, [sp, #12] ADDS r7, r7, r11 - # A[3] * B[1] + /* A[3] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[3] * B[2] + /* A[3] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[3] * B[3] + /* A[3] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[3] * B[4] + /* A[3] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[3] * B[5] + /* A[3] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[3] * B[6] + /* A[3] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 
- # A[3] * B[7] + /* A[3] * B[7] */ LDR lr, [r2, #28] ADC r6, r0, #0x0 UMLAL r5, r6, r12, lr - # A[4] * B[0] + /* A[4] * B[0] */ LDR r12, [r1, #16] LDR lr, [r2] MOV r11, #0x0 UMLAL r7, r11, r12, lr STR r7, [sp, #16] ADDS r8, r8, r11 - # A[4] * B[1] + /* A[4] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[4] * B[2] + /* A[4] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[4] * B[3] + /* A[4] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[4] * B[4] + /* A[4] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[4] * B[5] + /* A[4] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[4] * B[6] + /* A[4] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[4] * B[7] + /* A[4] * B[7] */ LDR lr, [r2, #28] ADC r7, r0, #0x0 UMLAL r6, r7, r12, lr - # A[5] * B[0] + /* A[5] * B[0] */ LDR r12, [r1, #20] LDR lr, [r2] MOV r11, #0x0 UMLAL r8, r11, r12, lr STR r8, [sp, #20] ADDS r9, r9, r11 - # A[5] * B[1] + /* A[5] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r9, r11, r12, lr ADDS r10, r10, r11 - # A[5] * B[2] + /* A[5] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[5] * B[3] + /* A[5] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[5] * B[4] + /* A[5] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[5] * B[5] + /* A[5] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[5] * B[6] + /* A[5] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[5] * B[7] + /* A[5] * B[7] */ LDR lr, [r2, #28] ADC r8, r0, #0x0 UMLAL r7, r8, r12, lr - # A[6] * B[0] + /* A[6] * B[0] */ LDR r12, [r1, #24] LDR lr, [r2] MOV r11, #0x0 UMLAL r9, r11, r12, lr STR r9, [sp, #24] ADDS r10, r10, r11 - # A[6] * B[1] + /* A[6] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r10, r11, r12, lr ADDS r3, r3, r11 - # A[6] * B[2] + /* A[6] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[6] * B[3] + /* A[6] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[6] * B[4] + /* A[6] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[6] * B[5] + /* A[6] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[6] * B[6] + /* A[6] * B[6] */ LDR lr, [r2, #24] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[6] * B[7] + /* A[6] * B[7] */ LDR lr, [r2, #28] ADC r9, r0, #0x0 UMLAL r8, r9, r12, lr - # A[7] * B[0] + /* A[7] * B[0] */ LDR r12, [r1, #28] LDR lr, [r2] MOV r11, #0x0 UMLAL r10, r11, r12, lr STR r10, [sp, #28] ADDS r3, r3, r11 - # A[7] * B[1] + /* A[7] * B[1] */ LDR lr, [r2, #4] ADC r11, r0, #0x0 UMLAL r3, r11, r12, lr ADDS r4, r4, r11 - # A[7] * B[2] + /* A[7] * B[2] */ LDR lr, [r2, #8] ADC r11, r0, #0x0 UMLAL r4, r11, r12, lr ADDS r5, r5, r11 - # A[7] * B[3] + /* A[7] * B[3] */ LDR lr, [r2, #12] ADC r11, r0, #0x0 UMLAL r5, r11, r12, lr ADDS r6, r6, r11 - # A[7] * B[4] + /* A[7] * B[4] */ LDR lr, [r2, #16] ADC r11, r0, #0x0 UMLAL r6, r11, r12, lr ADDS r7, r7, r11 - # A[7] * B[5] + /* A[7] * B[5] */ LDR lr, [r2, #20] ADC r11, r0, #0x0 UMLAL r7, r11, r12, lr ADDS r8, r8, r11 - # A[7] * B[6] + /* A[7] * B[6] */ LDR lr, 
[r2, #24] ADC r11, r0, #0x0 UMLAL r8, r11, r12, lr ADDS r9, r9, r11 - # A[7] * B[7] + /* A[7] * B[7] */ LDR lr, [r2, #28] ADC r10, r0, #0x0 UMLAL r9, r10, r12, lr ADD lr, sp, #0x20 STM lr, {r3, r4, r5, r6, r7, r8, r9, r10} MOV r0, sp - # Add c to a * b + /* Add c to a * b */ LDR lr, [sp, #76] LDM r0, {r2, r3, r4, r5, r6, r7, r8, r9} LDM lr!, {r1, r10, r11, r12} @@ -5622,7 +5622,7 @@ sc_muladd: ADCS r8, r8, #0x0 ADC r9, r9, #0x0 SUB r0, r0, #0x20 - # Get 252..503 and 504..507 + /* Get 252..503 and 504..507 */ LSR lr, r9, #24 LSL r9, r9, #4 ORR r9, r9, r8, LSR #28 @@ -5641,7 +5641,7 @@ sc_muladd: LSL r2, r2, #4 ORR r2, r2, r1, LSR #28 BFC r9, #28, #4 - # Add order times bits 504..507 + /* Add order times bits 504..507 */ MOV r10, #0x2c13 MOVT r10, #0xa30a MOV r11, #0x9ce5 @@ -5672,7 +5672,7 @@ sc_muladd: SBCS r7, r7, #0x0 SBCS r8, r8, #0x0 SBC r9, r9, #0x0 - # Sub product of top 8 words and order + /* Sub product of top 8 words and order */ MOV r12, sp MOV r1, #0x2c13 MOVT r1, #0xa30a @@ -5840,7 +5840,7 @@ sc_muladd: UMLAL r11, lr, r9, r1 STM r12!, {r10, r11, lr} SUB r12, r12, #0x20 - # Subtract at 4 * 32 + /* Subtract at 4 * 32 */ LDM r12, {r10, r11} SUBS r10, r10, r2 SBCS r11, r11, r3 @@ -5859,7 +5859,7 @@ sc_muladd: STM r12!, {r10, r11} SUB r12, r12, #0x24 ASR lr, r11, #25 - # Conditionally subtract order starting at bit 125 + /* Conditionally subtract order starting at bit 125 */ MOV r1, #0xa0000000 MOV r2, #0xba7d MOVT r2, #0x4b9e @@ -5897,7 +5897,7 @@ sc_muladd: STM r12!, {r10} SUB r0, r0, #0x10 MOV r12, sp - # Load bits 252-376 + /* Load bits 252-376 */ ADD r12, r12, #0x1c LDM r12, {r1, r2, r3, r4, r5} LSL r5, r5, #4 @@ -5910,9 +5910,9 @@ sc_muladd: ORR r2, r2, r1, LSR #28 BFC r5, #29, #3 SUB r12, r12, #0x1c - # Sub product of top 4 words and order + /* Sub product of top 4 words and order */ MOV r0, sp - # * -5cf5d3ed + /* * -5cf5d3ed */ MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 @@ -5932,7 +5932,7 @@ sc_muladd: UMLAL r9, lr, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -5812631b + /* * -5812631b */ MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV r10, #0x0 @@ -5952,7 +5952,7 @@ sc_muladd: UMLAL r9, r10, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -a2f79cd7 + /* * -a2f79cd7 */ MOV r1, #0x6329 MOVT r1, #0x5d08 MOV r11, #0x0 @@ -5972,7 +5972,7 @@ sc_muladd: UMLAL r9, r11, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -14def9df + /* * -14def9df */ MOV r1, #0x621 MOVT r1, #0xeb21 MOV r12, #0x0 @@ -5992,14 +5992,14 @@ sc_muladd: UMLAL r9, r12, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # Add overflows at 4 * 32 + /* Add overflows at 4 * 32 */ LDM r0, {r6, r7, r8, r9} BFC r9, #28, #4 ADDS r6, r6, lr ADCS r7, r7, r10 ADCS r8, r8, r11 ADC r9, r9, r12 - # Subtract top at 4 * 32 + /* Subtract top at 4 * 32 */ SUBS r6, r6, r2 SBCS r7, r7, r3 SBCS r8, r8, r4 @@ -6030,7 +6030,7 @@ sc_muladd: ADC r9, r9, r1 BFC r9, #28, #4 LDR r0, [sp, #68] - # Store result + /* Store result */ STR r2, [r0] STR r3, [r0, #4] STR r4, [r0, #8] @@ -6041,7 +6041,7 @@ sc_muladd: STR r9, [r0, #28] ADD sp, sp, #0x50 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 994 + /* Cycle Count = 994 */ .size sc_muladd,.-sc_muladd #else .text @@ -6153,7 +6153,7 @@ sc_muladd: ADD lr, sp, #0x20 STM lr, {r3, r4, r5, r6, r7, r8, r9, r10} MOV r0, sp - # Add c to a * b + /* Add c to a * b */ LDR lr, [sp, #76] LDM r0, {r2, r3, r4, r5, r6, r7, r8, r9} LDM lr!, {r1, r10, r11, r12} @@ -6178,7 +6178,7 @@ sc_muladd: ADCS r8, r8, #0x0 ADC r9, r9, #0x0 SUB r0, r0, #0x20 - # Get 252..503 and 504..507 + /* 
Get 252..503 and 504..507 */ LSR lr, r9, #24 LSL r9, r9, #4 ORR r9, r9, r8, LSR #28 @@ -6197,7 +6197,7 @@ sc_muladd: LSL r2, r2, #4 ORR r2, r2, r1, LSR #28 BFC r9, #28, #4 - # Add order times bits 504..507 + /* Add order times bits 504..507 */ MOV r10, #0x2c13 MOVT r10, #0xa30a MOV r11, #0x9ce5 @@ -6219,7 +6219,7 @@ sc_muladd: SBCS r7, r7, #0x0 SBCS r8, r8, #0x0 SBC r9, r9, #0x0 - # Sub product of top 8 words and order + /* Sub product of top 8 words and order */ MOV r12, sp MOV r1, #0x2c13 MOVT r1, #0xa30a @@ -6303,7 +6303,7 @@ sc_muladd: UMAAL r11, lr, r9, r1 STM r12!, {r10, r11, lr} SUB r12, r12, #0x20 - # Subtract at 4 * 32 + /* Subtract at 4 * 32 */ LDM r12, {r10, r11} SUBS r10, r10, r2 SBCS r11, r11, r3 @@ -6322,7 +6322,7 @@ sc_muladd: STM r12!, {r10, r11} SUB r12, r12, #0x24 ASR lr, r11, #25 - # Conditionally subtract order starting at bit 125 + /* Conditionally subtract order starting at bit 125 */ MOV r1, #0xa0000000 MOV r2, #0xba7d MOVT r2, #0x4b9e @@ -6360,7 +6360,7 @@ sc_muladd: STM r12!, {r10} SUB r0, r0, #0x10 MOV r12, sp - # Load bits 252-376 + /* Load bits 252-376 */ ADD r12, r12, #0x1c LDM r12, {r1, r2, r3, r4, r5} LSL r5, r5, #4 @@ -6373,9 +6373,9 @@ sc_muladd: ORR r2, r2, r1, LSR #28 BFC r5, #29, #3 SUB r12, r12, #0x1c - # Sub product of top 4 words and order + /* Sub product of top 4 words and order */ MOV r0, sp - # * -5cf5d3ed + /* * -5cf5d3ed */ MOV r1, #0x2c13 MOVT r1, #0xa30a MOV lr, #0x0 @@ -6386,7 +6386,7 @@ sc_muladd: UMAAL r9, lr, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -5812631b + /* * -5812631b */ MOV r1, #0x9ce5 MOVT r1, #0xa7ed MOV r10, #0x0 @@ -6397,7 +6397,7 @@ sc_muladd: UMAAL r9, r10, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -a2f79cd7 + /* * -a2f79cd7 */ MOV r1, #0x6329 MOVT r1, #0x5d08 MOV r11, #0x0 @@ -6408,7 +6408,7 @@ sc_muladd: UMAAL r9, r11, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # * -14def9df + /* * -14def9df */ MOV r1, #0x621 MOVT r1, #0xeb21 MOV r12, #0x0 @@ -6419,14 +6419,14 @@ sc_muladd: UMAAL r9, r12, r5, r1 STM r0, {r6, r7, r8, r9} ADD r0, r0, #0x4 - # Add overflows at 4 * 32 + /* Add overflows at 4 * 32 */ LDM r0, {r6, r7, r8, r9} BFC r9, #28, #4 ADDS r6, r6, lr ADCS r7, r7, r10 ADCS r8, r8, r11 ADC r9, r9, r12 - # Subtract top at 4 * 32 + /* Subtract top at 4 * 32 */ SUBS r6, r6, r2 SBCS r7, r7, r3 SBCS r8, r8, r4 @@ -6457,7 +6457,7 @@ sc_muladd: ADC r9, r9, r1 BFC r9, #28, #4 LDR r0, [sp, #68] - # Store result + /* Store result */ STR r2, [r0] STR r3, [r0, #4] STR r4, [r0, #8] @@ -6468,7 +6468,7 @@ sc_muladd: STR r9, [r0, #28] ADD sp, sp, #0x50 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 752 + /* Cycle Count = 752 */ .size sc_muladd,.-sc_muladd #endif /* WOLFSSL_SP_NO_UMAAL */ #endif /* HAVE_ED25519_SIGN */ diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c index edb2af068..a5403e99e 100644 --- a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c +++ b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c @@ -39,7 +39,7 @@ #ifdef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_ARMASM -#if !defined(__aarch64__) && defined(__arm__) +#if !defined(__aarch64__) && defined(__thumb__) #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm @@ -2796,9 +2796,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "MOV %[a], #0x1c\n\t" "STR %[a], [sp, #176]\n\t" "\n" - "L_curve25519_words_%=:\n\t" + "L_curve25519_words:\n\t" "\n" - "L_curve25519_bits_%=:\n\t" + "L_curve25519_bits:\n\t" "LDR %[n], [sp, #164]\n\t" "LDR %[a], [%[n], r2]\n\t" "LDR %[n], [sp, #180]\n\t" @@ 
-2978,19 +2978,19 @@ int curve25519(byte* r, const byte* n, const byte* a) "LDR %[n], [sp, #180]\n\t" "SUBS %[n], %[n], #0x1\n\t" "STR %[n], [sp, #180]\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGE L_curve25519_bits_%=\n\t" +#ifdef __GNUC__ + "BGE L_curve25519_bits\n\t" #else - "BGE.N L_curve25519_bits_%=\n\t" + "BGE.W L_curve25519_bits\n\t" #endif "MOV %[n], #0x1f\n\t" "STR %[n], [sp, #180]\n\t" "SUBS %[a], %[a], #0x4\n\t" "STR %[a], [sp, #176]\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGE L_curve25519_words_%=\n\t" +#ifdef __GNUC__ + "BGE L_curve25519_words\n\t" #else - "BGE.N L_curve25519_words_%=\n\t" + "BGE.W L_curve25519_words\n\t" #endif /* Invert */ "ADD r1, sp, #0x0\n\t" @@ -3022,7 +3022,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x4\n\t" "\n" - "L_curve25519_inv_1_%=:\n\t" + "L_curve25519_inv_1:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3030,9 +3030,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_1_%=\n\t" + "BNE L_curve25519_inv_1\n\t" #else - "BNE.N L_curve25519_inv_1_%=\n\t" + "BNE.N L_curve25519_inv_1\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3043,7 +3043,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x9\n\t" "\n" - "L_curve25519_inv_2_%=:\n\t" + "L_curve25519_inv_2:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3051,9 +3051,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_2_%=\n\t" + "BNE L_curve25519_inv_2\n\t" #else - "BNE.N L_curve25519_inv_2_%=\n\t" + "BNE.N L_curve25519_inv_2\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3064,7 +3064,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x13\n\t" "\n" - "L_curve25519_inv_3_%=:\n\t" + "L_curve25519_inv_3:\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x80\n\t" "PUSH {r12}\n\t" @@ -3072,9 +3072,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_3_%=\n\t" + "BNE L_curve25519_inv_3\n\t" #else - "BNE.N L_curve25519_inv_3_%=\n\t" + "BNE.N L_curve25519_inv_3\n\t" #endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" @@ -3082,7 +3082,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0xa\n\t" "\n" - "L_curve25519_inv_4_%=:\n\t" + "L_curve25519_inv_4:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3090,9 +3090,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_4_%=\n\t" + "BNE L_curve25519_inv_4\n\t" #else - "BNE.N L_curve25519_inv_4_%=\n\t" + "BNE.N L_curve25519_inv_4\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3103,7 +3103,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x31\n\t" "\n" - "L_curve25519_inv_5_%=:\n\t" + "L_curve25519_inv_5:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" 
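The label and branch hunks in this file all follow one pattern: the `%=` suffix, which GCC expands to a number unique to each asm instance so that duplicated bodies cannot collide, is dropped for toolchains that never expand it, and backward branches that outgrew the 16-bit encoding are forced wide. A hedged sketch of the pattern on a Thumb-2 target (`countdown` is an illustrative function, not wolfSSL code):

```c
static int countdown(int n)
{
    __asm__ __volatile__ (
        "\n"
        /* Plain label: assembles under GCC, IAR and Keil alike, but
         * the asm body must then appear only once per object file. */
        "L_countdown_loop:\n\t"
        "SUBS %[n], %[n], #1\n\t"
#ifdef __GNUC__
        /* GNU as relaxes a bare BNE to a 16- or 32-bit encoding. */
        "BNE L_countdown_loop\n\t"
#else
        /* BNE.N is the 16-bit form and reaches only about -256..+254
         * bytes; BNE.W forces the 32-bit form (about +/-1 MB). */
        "BNE.W L_countdown_loop\n\t"
#endif
        : [n] "+r" (n)
        :
        : "cc"
    );
    return n;
}
```

The same reasoning explains the `.N` -> `.W` flips in the .S files: once a loop body grew past the short-branch range, the narrow encoding could no longer encode the displacement.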
@@ -3111,9 +3111,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_5_%=\n\t" + "BNE L_curve25519_inv_5\n\t" #else - "BNE.N L_curve25519_inv_5_%=\n\t" + "BNE.N L_curve25519_inv_5\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3124,7 +3124,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x63\n\t" "\n" - "L_curve25519_inv_6_%=:\n\t" + "L_curve25519_inv_6:\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x80\n\t" "PUSH {r12}\n\t" @@ -3132,9 +3132,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_6_%=\n\t" + "BNE L_curve25519_inv_6\n\t" #else - "BNE.N L_curve25519_inv_6_%=\n\t" + "BNE.N L_curve25519_inv_6\n\t" #endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" @@ -3142,7 +3142,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0x32\n\t" "\n" - "L_curve25519_inv_7_%=:\n\t" + "L_curve25519_inv_7:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3150,9 +3150,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_7_%=\n\t" + "BNE L_curve25519_inv_7\n\t" #else - "BNE.N L_curve25519_inv_7_%=\n\t" + "BNE.N L_curve25519_inv_7\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3160,7 +3160,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0x5\n\t" "\n" - "L_curve25519_inv_8_%=:\n\t" + "L_curve25519_inv_8:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3168,9 +3168,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_8_%=\n\t" + "BNE L_curve25519_inv_8\n\t" #else - "BNE.N L_curve25519_inv_8_%=\n\t" + "BNE.N L_curve25519_inv_8\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3234,7 +3234,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "STM r3, {r4, r5, r6, r7, r8, r9, r10, r11}\n\t" "MOV %[a], #0xfe\n\t" "\n" - "L_curve25519_bits_%=:\n\t" + "L_curve25519_bits:\n\t" "STR %[a], [sp, #168]\n\t" "LDR %[n], [sp, #160]\n\t" "AND r4, %[a], #0x1f\n\t" @@ -3320,9 +3320,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "LDR %[a], [sp, #168]\n\t" "SUBS %[a], %[a], #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGE L_curve25519_bits_%=\n\t" + "BGE L_curve25519_bits\n\t" #else - "BGE.N L_curve25519_bits_%=\n\t" + "BGE.N L_curve25519_bits\n\t" #endif /* Cycle Count: 171 */ "LDR %[n], [sp, #184]\n\t" @@ -3359,7 +3359,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x4\n\t" "\n" - "L_curve25519_inv_1_%=:\n\t" + "L_curve25519_inv_1:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3367,9 +3367,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_1_%=\n\t" + "BNE L_curve25519_inv_1\n\t" #else - "BNE.N L_curve25519_inv_1_%=\n\t" + "BNE.N 
L_curve25519_inv_1\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3380,7 +3380,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x9\n\t" "\n" - "L_curve25519_inv_2_%=:\n\t" + "L_curve25519_inv_2:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3388,9 +3388,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_2_%=\n\t" + "BNE L_curve25519_inv_2\n\t" #else - "BNE.N L_curve25519_inv_2_%=\n\t" + "BNE.N L_curve25519_inv_2\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3401,7 +3401,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x13\n\t" "\n" - "L_curve25519_inv_3_%=:\n\t" + "L_curve25519_inv_3:\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x80\n\t" "PUSH {r12}\n\t" @@ -3409,9 +3409,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_3_%=\n\t" + "BNE L_curve25519_inv_3\n\t" #else - "BNE.N L_curve25519_inv_3_%=\n\t" + "BNE.N L_curve25519_inv_3\n\t" #endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" @@ -3419,7 +3419,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0xa\n\t" "\n" - "L_curve25519_inv_4_%=:\n\t" + "L_curve25519_inv_4:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3427,9 +3427,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_4_%=\n\t" + "BNE L_curve25519_inv_4\n\t" #else - "BNE.N L_curve25519_inv_4_%=\n\t" + "BNE.N L_curve25519_inv_4\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3440,7 +3440,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x31\n\t" "\n" - "L_curve25519_inv_5_%=:\n\t" + "L_curve25519_inv_5:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3448,9 +3448,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_5_%=\n\t" + "BNE L_curve25519_inv_5\n\t" #else - "BNE.N L_curve25519_inv_5_%=\n\t" + "BNE.N L_curve25519_inv_5\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3461,7 +3461,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_sq_op\n\t" "MOV r12, #0x63\n\t" "\n" - "L_curve25519_inv_6_%=:\n\t" + "L_curve25519_inv_6:\n\t" "ADD r1, sp, #0x80\n\t" "ADD r0, sp, #0x80\n\t" "PUSH {r12}\n\t" @@ -3469,9 +3469,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_6_%=\n\t" + "BNE L_curve25519_inv_6\n\t" #else - "BNE.N L_curve25519_inv_6_%=\n\t" + "BNE.N L_curve25519_inv_6\n\t" #endif "ADD r2, sp, #0x60\n\t" "ADD r1, sp, #0x80\n\t" @@ -3479,7 +3479,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0x32\n\t" "\n" - "L_curve25519_inv_7_%=:\n\t" + "L_curve25519_inv_7:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3487,9 +3487,9 @@ int curve25519(byte* r, const byte* n, 
const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_7_%=\n\t" + "BNE L_curve25519_inv_7\n\t" #else - "BNE.N L_curve25519_inv_7_%=\n\t" + "BNE.N L_curve25519_inv_7\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3497,7 +3497,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "BL fe_mul_op\n\t" "MOV r12, #0x5\n\t" "\n" - "L_curve25519_inv_8_%=:\n\t" + "L_curve25519_inv_8:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3505,9 +3505,9 @@ int curve25519(byte* r, const byte* n, const byte* a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_curve25519_inv_8_%=\n\t" + "BNE L_curve25519_inv_8\n\t" #else - "BNE.N L_curve25519_inv_8_%=\n\t" + "BNE.N L_curve25519_inv_8\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3589,7 +3589,7 @@ void fe_invert(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x4\n\t" "\n" - "L_fe_invert1_%=:\n\t" + "L_fe_invert1:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3597,9 +3597,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert1_%=\n\t" + "BNE L_fe_invert1\n\t" #else - "BNE.N L_fe_invert1_%=\n\t" + "BNE.N L_fe_invert1\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3610,7 +3610,7 @@ void fe_invert(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x9\n\t" "\n" - "L_fe_invert2_%=:\n\t" + "L_fe_invert2:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3618,9 +3618,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert2_%=\n\t" + "BNE L_fe_invert2\n\t" #else - "BNE.N L_fe_invert2_%=\n\t" + "BNE.N L_fe_invert2\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3631,7 +3631,7 @@ void fe_invert(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x13\n\t" "\n" - "L_fe_invert3_%=:\n\t" + "L_fe_invert3:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3639,9 +3639,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert3_%=\n\t" + "BNE L_fe_invert3\n\t" #else - "BNE.N L_fe_invert3_%=\n\t" + "BNE.N L_fe_invert3\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3649,7 +3649,7 @@ void fe_invert(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0xa\n\t" "\n" - "L_fe_invert4_%=:\n\t" + "L_fe_invert4:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3657,9 +3657,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert4_%=\n\t" + "BNE L_fe_invert4\n\t" #else - "BNE.N L_fe_invert4_%=\n\t" + "BNE.N L_fe_invert4\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3670,7 +3670,7 @@ void fe_invert(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x31\n\t" "\n" - "L_fe_invert5_%=:\n\t" + "L_fe_invert5:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3678,9 +3678,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert5_%=\n\t" + "BNE L_fe_invert5\n\t" #else - "BNE.N L_fe_invert5_%=\n\t" + "BNE.N L_fe_invert5\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3691,7 +3691,7 @@ void fe_invert(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x63\n\t" "\n" - "L_fe_invert6_%=:\n\t" + "L_fe_invert6:\n\t" "ADD r1, sp, #0x60\n\t" "ADD r0, sp, #0x60\n\t" "PUSH {r12}\n\t" @@ -3699,9 +3699,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert6_%=\n\t" + "BNE L_fe_invert6\n\t" #else - "BNE.N L_fe_invert6_%=\n\t" + "BNE.N L_fe_invert6\n\t" #endif "ADD r2, sp, #0x40\n\t" "ADD r1, sp, #0x60\n\t" @@ -3709,7 +3709,7 @@ void fe_invert(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0x32\n\t" "\n" - "L_fe_invert7_%=:\n\t" + "L_fe_invert7:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -3717,9 +3717,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert7_%=\n\t" + "BNE L_fe_invert7\n\t" #else - "BNE.N L_fe_invert7_%=\n\t" + "BNE.N L_fe_invert7\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -3727,7 +3727,7 @@ void fe_invert(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0x5\n\t" "\n" - "L_fe_invert8_%=:\n\t" + "L_fe_invert8:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -3735,9 +3735,9 @@ void fe_invert(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_invert8_%=\n\t" + "BNE L_fe_invert8\n\t" #else - "BNE.N L_fe_invert8_%=\n\t" + "BNE.N L_fe_invert8\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4275,7 +4275,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x4\n\t" "\n" - "L_fe_pow22523_1_%=:\n\t" + "L_fe_pow22523_1:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -4283,9 +4283,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_1_%=\n\t" + "BNE L_fe_pow22523_1\n\t" #else - "BNE.N L_fe_pow22523_1_%=\n\t" + "BNE.N L_fe_pow22523_1\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4296,7 +4296,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x9\n\t" "\n" - "L_fe_pow22523_2_%=:\n\t" + "L_fe_pow22523_2:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -4304,9 +4304,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_2_%=\n\t" + "BNE L_fe_pow22523_2\n\t" #else - "BNE.N L_fe_pow22523_2_%=\n\t" + "BNE.N L_fe_pow22523_2\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4317,7 +4317,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x13\n\t" "\n" - "L_fe_pow22523_3_%=:\n\t" + "L_fe_pow22523_3:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -4325,9 +4325,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_3_%=\n\t" + "BNE L_fe_pow22523_3\n\t" #else - "BNE.N L_fe_pow22523_3_%=\n\t" + "BNE.N L_fe_pow22523_3\n\t" #endif "ADD r2, sp, 
#0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -4335,7 +4335,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0xa\n\t" "\n" - "L_fe_pow22523_4_%=:\n\t" + "L_fe_pow22523_4:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -4343,9 +4343,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_4_%=\n\t" + "BNE L_fe_pow22523_4\n\t" #else - "BNE.N L_fe_pow22523_4_%=\n\t" + "BNE.N L_fe_pow22523_4\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4356,7 +4356,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x31\n\t" "\n" - "L_fe_pow22523_5_%=:\n\t" + "L_fe_pow22523_5:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -4364,9 +4364,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_5_%=\n\t" + "BNE L_fe_pow22523_5\n\t" #else - "BNE.N L_fe_pow22523_5_%=\n\t" + "BNE.N L_fe_pow22523_5\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4377,7 +4377,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_sq_op\n\t" "MOV r12, #0x63\n\t" "\n" - "L_fe_pow22523_6_%=:\n\t" + "L_fe_pow22523_6:\n\t" "ADD r1, sp, #0x40\n\t" "ADD r0, sp, #0x40\n\t" "PUSH {r12}\n\t" @@ -4385,9 +4385,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_6_%=\n\t" + "BNE L_fe_pow22523_6\n\t" #else - "BNE.N L_fe_pow22523_6_%=\n\t" + "BNE.N L_fe_pow22523_6\n\t" #endif "ADD r2, sp, #0x20\n\t" "ADD r1, sp, #0x40\n\t" @@ -4395,7 +4395,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0x32\n\t" "\n" - "L_fe_pow22523_7_%=:\n\t" + "L_fe_pow22523_7:\n\t" "ADD r1, sp, #0x20\n\t" "ADD r0, sp, #0x20\n\t" "PUSH {r12}\n\t" @@ -4403,9 +4403,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_7_%=\n\t" + "BNE L_fe_pow22523_7\n\t" #else - "BNE.N L_fe_pow22523_7_%=\n\t" + "BNE.N L_fe_pow22523_7\n\t" #endif "MOV r2, sp\n\t" "ADD r1, sp, #0x20\n\t" @@ -4413,7 +4413,7 @@ void fe_pow22523(fe r, const fe a) "BL fe_mul_op\n\t" "MOV r12, #0x2\n\t" "\n" - "L_fe_pow22523_8_%=:\n\t" + "L_fe_pow22523_8:\n\t" "MOV r1, sp\n\t" "MOV r0, sp\n\t" "PUSH {r12}\n\t" @@ -4421,9 +4421,9 @@ void fe_pow22523(fe r, const fe a) "POP {r12}\n\t" "SUBS r12, r12, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_fe_pow22523_8_%=\n\t" + "BNE L_fe_pow22523_8\n\t" #else - "BNE.N L_fe_pow22523_8_%=\n\t" + "BNE.N L_fe_pow22523_8\n\t" #endif "LDR r2, [sp, #100]\n\t" "MOV r1, sp\n\t" @@ -6904,7 +6904,7 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c) #endif /* HAVE_CURVE25519 || HAVE_ED25519 */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ -#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* !defined(__aarch64__) && defined(__thumb__) */ #endif /* WOLFSSL_ARMASM */ #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S index 7c59e2548..30d8dc76b 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha256-asm.S +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm.S @@ -113,7 +113,7 @@ Transform_Sha256_Len: PUSH {r4, r5, 
r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0xc0 ADR r3, L_SHA256_transform_len_k - # Copy digest to add in at end + /* Copy digest to add in at end */ LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] LDRD r8, r9, [r0, #16] @@ -122,9 +122,9 @@ Transform_Sha256_Len: STRD r6, r7, [sp, #72] STRD r8, r9, [sp, #80] STRD r10, r11, [sp, #88] - # Start of loop processing a block + /* Start of loop processing a block */ L_SHA256_transform_len_begin: - # Load, Reverse and Store W - 64 bytes + /* Load, Reverse and Store W - 64 bytes */ LDR r4, [r1] LDR r5, [r1, #4] LDR r6, [r1, #8] @@ -169,9 +169,9 @@ L_SHA256_transform_len_begin: LDR r4, [r0, #8] EOR r11, r11, r4 MOV r12, #0x3 - # Start of 16 rounds + /* Start of 16 rounds */ L_SHA256_transform_len_start: - # Round 0 + /* Round 0 */ LDR r5, [r0, #16] LDR r6, [r0, #20] LDR r7, [r0, #24] @@ -203,7 +203,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #12] STR r9, [r0, #28] - # Calc new W[0] + /* Calc new W[0] */ LDR r6, [sp, #56] LDR r7, [sp, #36] LDR r8, [sp, #4] @@ -218,7 +218,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp] - # Round 1 + /* Round 1 */ LDR r5, [r0, #12] LDR r6, [r0, #16] LDR r7, [r0, #20] @@ -250,7 +250,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #8] STR r9, [r0, #24] - # Calc new W[1] + /* Calc new W[1] */ LDR r6, [sp, #60] LDR r7, [sp, #40] LDR r8, [sp, #8] @@ -265,7 +265,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #4] - # Round 2 + /* Round 2 */ LDR r5, [r0, #8] LDR r6, [r0, #12] LDR r7, [r0, #16] @@ -297,7 +297,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #4] STR r9, [r0, #20] - # Calc new W[2] + /* Calc new W[2] */ LDR r6, [sp] LDR r7, [sp, #44] LDR r8, [sp, #12] @@ -312,7 +312,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #8] - # Round 3 + /* Round 3 */ LDR r5, [r0, #4] LDR r6, [r0, #8] LDR r7, [r0, #12] @@ -344,7 +344,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0] STR r9, [r0, #16] - # Calc new W[3] + /* Calc new W[3] */ LDR r6, [sp, #4] LDR r7, [sp, #48] LDR r8, [sp, #16] @@ -359,7 +359,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #12] - # Round 4 + /* Round 4 */ LDR r5, [r0] LDR r6, [r0, #4] LDR r7, [r0, #8] @@ -391,7 +391,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #28] STR r9, [r0, #12] - # Calc new W[4] + /* Calc new W[4] */ LDR r6, [sp, #8] LDR r7, [sp, #52] LDR r8, [sp, #20] @@ -406,7 +406,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #16] - # Round 5 + /* Round 5 */ LDR r5, [r0, #28] LDR r6, [r0] LDR r7, [r0, #4] @@ -438,7 +438,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #24] STR r9, [r0, #8] - # Calc new W[5] + /* Calc new W[5] */ LDR r6, [sp, #12] LDR r7, [sp, #56] LDR r8, [sp, #24] @@ -453,7 +453,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #20] - # Round 6 + /* Round 6 */ LDR r5, [r0, #24] LDR r6, [r0, #28] LDR r7, [r0] @@ -485,7 +485,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #20] STR r9, [r0, #4] - # Calc new W[6] + /* Calc new W[6] */ LDR r6, [sp, #16] LDR r7, [sp, #60] LDR r8, [sp, #28] @@ -500,7 +500,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #24] - # Round 7 + /* Round 7 */ LDR r5, [r0, #20] LDR r6, [r0, #24] LDR r7, [r0, #28] @@ -532,7 +532,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #16] STR r9, [r0] - # Calc new W[7] + /* Calc new W[7] */ LDR 
r6, [sp, #20] LDR r7, [sp] LDR r8, [sp, #32] @@ -547,7 +547,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #28] - # Round 8 + /* Round 8 */ LDR r5, [r0, #16] LDR r6, [r0, #20] LDR r7, [r0, #24] @@ -579,7 +579,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #12] STR r9, [r0, #28] - # Calc new W[8] + /* Calc new W[8] */ LDR r6, [sp, #24] LDR r7, [sp, #4] LDR r8, [sp, #36] @@ -594,7 +594,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #32] - # Round 9 + /* Round 9 */ LDR r5, [r0, #12] LDR r6, [r0, #16] LDR r7, [r0, #20] @@ -626,7 +626,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #8] STR r9, [r0, #24] - # Calc new W[9] + /* Calc new W[9] */ LDR r6, [sp, #28] LDR r7, [sp, #8] LDR r8, [sp, #40] @@ -641,7 +641,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #36] - # Round 10 + /* Round 10 */ LDR r5, [r0, #8] LDR r6, [r0, #12] LDR r7, [r0, #16] @@ -673,7 +673,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #4] STR r9, [r0, #20] - # Calc new W[10] + /* Calc new W[10] */ LDR r6, [sp, #32] LDR r7, [sp, #12] LDR r8, [sp, #44] @@ -688,7 +688,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #40] - # Round 11 + /* Round 11 */ LDR r5, [r0, #4] LDR r6, [r0, #8] LDR r7, [r0, #12] @@ -720,7 +720,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0] STR r9, [r0, #16] - # Calc new W[11] + /* Calc new W[11] */ LDR r6, [sp, #36] LDR r7, [sp, #16] LDR r8, [sp, #48] @@ -735,7 +735,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #44] - # Round 12 + /* Round 12 */ LDR r5, [r0] LDR r6, [r0, #4] LDR r7, [r0, #8] @@ -767,7 +767,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #28] STR r9, [r0, #12] - # Calc new W[12] + /* Calc new W[12] */ LDR r6, [sp, #40] LDR r7, [sp, #20] LDR r8, [sp, #52] @@ -782,7 +782,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #48] - # Round 13 + /* Round 13 */ LDR r5, [r0, #28] LDR r6, [r0] LDR r7, [r0, #4] @@ -814,7 +814,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #24] STR r9, [r0, #8] - # Calc new W[13] + /* Calc new W[13] */ LDR r6, [sp, #44] LDR r7, [sp, #24] LDR r8, [sp, #56] @@ -829,7 +829,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #52] - # Round 14 + /* Round 14 */ LDR r5, [r0, #24] LDR r6, [r0, #28] LDR r7, [r0] @@ -861,7 +861,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #20] STR r9, [r0, #4] - # Calc new W[14] + /* Calc new W[14] */ LDR r6, [sp, #48] LDR r7, [sp, #28] LDR r8, [sp, #60] @@ -876,7 +876,7 @@ L_SHA256_transform_len_start: ADD r4, r4, r5 ADD r9, r9, r4 STR r9, [sp, #56] - # Round 15 + /* Round 15 */ LDR r5, [r0, #20] LDR r6, [r0, #24] LDR r7, [r0, #28] @@ -908,7 +908,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #16] STR r9, [r0] - # Calc new W[15] + /* Calc new W[15] */ LDR r6, [sp, #52] LDR r7, [sp, #32] LDR r8, [sp] @@ -925,12 +925,12 @@ L_SHA256_transform_len_start: STR r9, [sp, #60] ADD r3, r3, #0x40 SUBS r12, r12, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_SHA256_transform_len_start #else - BNE.N L_SHA256_transform_len_start + BNE.W L_SHA256_transform_len_start #endif - # Round 0 + /* Round 0 */ LDR r5, [r0, #16] LDR r6, [r0, #20] LDR r7, [r0, #24] @@ -962,7 +962,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #12] STR r9, [r0, #28] - # Round 1 
+ /* Round 1 */ LDR r5, [r0, #12] LDR r6, [r0, #16] LDR r7, [r0, #20] @@ -994,7 +994,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #8] STR r9, [r0, #24] - # Round 2 + /* Round 2 */ LDR r5, [r0, #8] LDR r6, [r0, #12] LDR r7, [r0, #16] @@ -1026,7 +1026,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #4] STR r9, [r0, #20] - # Round 3 + /* Round 3 */ LDR r5, [r0, #4] LDR r6, [r0, #8] LDR r7, [r0, #12] @@ -1058,7 +1058,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0] STR r9, [r0, #16] - # Round 4 + /* Round 4 */ LDR r5, [r0] LDR r6, [r0, #4] LDR r7, [r0, #8] @@ -1090,7 +1090,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #28] STR r9, [r0, #12] - # Round 5 + /* Round 5 */ LDR r5, [r0, #28] LDR r6, [r0] LDR r7, [r0, #4] @@ -1122,7 +1122,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #24] STR r9, [r0, #8] - # Round 6 + /* Round 6 */ LDR r5, [r0, #24] LDR r6, [r0, #28] LDR r7, [r0] @@ -1154,7 +1154,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #20] STR r9, [r0, #4] - # Round 7 + /* Round 7 */ LDR r5, [r0, #20] LDR r6, [r0, #24] LDR r7, [r0, #28] @@ -1186,7 +1186,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #16] STR r9, [r0] - # Round 8 + /* Round 8 */ LDR r5, [r0, #16] LDR r6, [r0, #20] LDR r7, [r0, #24] @@ -1218,7 +1218,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #12] STR r9, [r0, #28] - # Round 9 + /* Round 9 */ LDR r5, [r0, #12] LDR r6, [r0, #16] LDR r7, [r0, #20] @@ -1250,7 +1250,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #8] STR r9, [r0, #24] - # Round 10 + /* Round 10 */ LDR r5, [r0, #8] LDR r6, [r0, #12] LDR r7, [r0, #16] @@ -1282,7 +1282,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #4] STR r9, [r0, #20] - # Round 11 + /* Round 11 */ LDR r5, [r0, #4] LDR r6, [r0, #8] LDR r7, [r0, #12] @@ -1314,7 +1314,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0] STR r9, [r0, #16] - # Round 12 + /* Round 12 */ LDR r5, [r0] LDR r6, [r0, #4] LDR r7, [r0, #8] @@ -1346,7 +1346,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #28] STR r9, [r0, #12] - # Round 13 + /* Round 13 */ LDR r5, [r0, #28] LDR r6, [r0] LDR r7, [r0, #4] @@ -1378,7 +1378,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #24] STR r9, [r0, #8] - # Round 14 + /* Round 14 */ LDR r5, [r0, #24] LDR r6, [r0, #28] LDR r7, [r0] @@ -1410,7 +1410,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r11 STR r8, [r0, #20] STR r9, [r0, #4] - # Round 15 + /* Round 15 */ LDR r5, [r0, #20] LDR r6, [r0, #24] LDR r7, [r0, #28] @@ -1442,7 +1442,7 @@ L_SHA256_transform_len_start: ADD r9, r9, r10 STR r8, [r0, #16] STR r9, [r0] - # Add in digest from start + /* Add in digest from start */ LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] LDRD r8, r9, [sp, #64] @@ -1470,14 +1470,14 @@ L_SHA256_transform_len_start: SUBS r2, r2, #0x40 SUB r3, r3, #0xc0 ADD r1, r1, #0x40 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_SHA256_transform_len_begin #else - BNE.N L_SHA256_transform_len_begin + BNE.W L_SHA256_transform_len_begin #endif ADD sp, sp, #0xc0 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 1874 + /* Cycle Count = 1874 */ .size Transform_Sha256_Len,.-Transform_Sha256_Len #endif /* WOLFSSL_ARMASM_NO_NEON */ #endif /* !NO_SHA256 */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c index 2483f036d..a2367c2a2 100644 --- 
a/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-sha256-asm_c.c @@ -39,7 +39,7 @@ #ifdef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_ARMASM -#if !defined(__aarch64__) && defined(__arm__) +#if !defined(__aarch64__) && defined(__thumb__) #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm @@ -84,8 +84,8 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) register wc_Sha256* sha256 __asm__ ("r0") = (wc_Sha256*)sha256_p; register const byte* data __asm__ ("r1") = (const byte*)data_p; register word32 len __asm__ ("r2") = (word32)len_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint32_t* L_SHA256_transform_len_k_c __asm__ ("r3") = (uint32_t*)&L_SHA256_transform_len_k; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( "SUB sp, sp, #0xc0\n\t" @@ -101,7 +101,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "STRD r10, r11, [sp, #88]\n\t" /* Start of loop processing a block */ "\n" - "L_SHA256_transform_len_begin_%=:\n\t" + "L_SHA256_transform_len_begin:\n\t" /* Load, Reverse and Store W - 64 bytes */ "LDR r4, [%[data]]\n\t" "LDR r5, [%[data], #4]\n\t" @@ -149,7 +149,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "MOV r12, #0x3\n\t" /* Start of 16 rounds */ "\n" - "L_SHA256_transform_len_start_%=:\n\t" + "L_SHA256_transform_len_start:\n\t" /* Round 0 */ "LDR r5, [%[sha256], #16]\n\t" "LDR r6, [%[sha256], #20]\n\t" @@ -904,10 +904,10 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "STR r9, [sp, #60]\n\t" "ADD r3, r3, #0x40\n\t" "SUBS r12, r12, #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_SHA256_transform_len_start_%=\n\t" +#ifdef __GNUC__ + "BNE L_SHA256_transform_len_start\n\t" #else - "BNE.N L_SHA256_transform_len_start_%=\n\t" + "BNE.W L_SHA256_transform_len_start\n\t" #endif /* Round 0 */ "LDR r5, [%[sha256], #16]\n\t" @@ -1449,14 +1449,20 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) "SUBS %[len], %[len], #0x40\n\t" "SUB r3, r3, #0xc0\n\t" "ADD %[data], %[data], #0x40\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_SHA256_transform_len_begin_%=\n\t" +#ifdef __GNUC__ + "BNE L_SHA256_transform_len_begin\n\t" #else - "BNE.N L_SHA256_transform_len_begin_%=\n\t" + "BNE.W L_SHA256_transform_len_begin\n\t" #endif "ADD sp, sp, #0xc0\n\t" - : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len), + [L_SHA256_transform_len_k] "+r" (L_SHA256_transform_len_k_c) : +#else + : [sha256] "+r" (sha256), [data] "+r" (data), [len] "+r" (len) + : [L_SHA256_transform_len_k] "r" (L_SHA256_transform_len_k) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } @@ -1465,7 +1471,7 @@ void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len) #endif /* !NO_SHA256 */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ -#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* !defined(__aarch64__) && defined(__thumb__) */ #endif /* WOLFSSL_ARMASM */ #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S index b420e7863..6031b9240 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha512-asm.S +++ 
b/wolfcrypt/src/port/arm/thumb2-sha512-asm.S @@ -209,7 +209,7 @@ Transform_Sha512_Len: PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} SUB sp, sp, #0xc0 ADR r3, L_SHA512_transform_len_k - # Copy digest to add in at end + /* Copy digest to add in at end */ LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] LDRD r8, r9, [r0, #16] @@ -226,9 +226,9 @@ Transform_Sha512_Len: STRD r6, r7, [sp, #168] STRD r8, r9, [sp, #176] STRD r10, r11, [sp, #184] - # Start of loop processing a block + /* Start of loop processing a block */ L_SHA512_transform_len_begin: - # Load, Reverse and Store W + /* Load, Reverse and Store W */ LDR r4, [r1] LDR r5, [r1, #4] LDR r6, [r1, #8] @@ -325,15 +325,15 @@ L_SHA512_transform_len_begin: STR r8, [sp, #116] STR r11, [sp, #120] STR r10, [sp, #124] - # Pre-calc: b ^ c + /* Pre-calc: b ^ c */ LDRD r10, r11, [r0, #8] LDRD r4, r5, [r0, #16] EOR r10, r10, r4 EOR r11, r11, r5 MOV r12, #0x4 - # Start of 16 rounds + /* Start of 16 rounds */ L_SHA512_transform_len_start: - # Round 0 + /* Round 0 */ LDRD r4, r5, [r0, #32] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -413,7 +413,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #56] MOV r10, r8 MOV r11, r9 - # Calc new W[0] + /* Calc new W[0] */ LDRD r4, r5, [sp, #112] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -457,7 +457,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp] - # Round 1 + /* Round 1 */ LDRD r4, r5, [r0, #24] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -537,7 +537,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #48] MOV r10, r8 MOV r11, r9 - # Calc new W[1] + /* Calc new W[1] */ LDRD r4, r5, [sp, #120] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -581,7 +581,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #8] - # Round 2 + /* Round 2 */ LDRD r4, r5, [r0, #16] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -661,7 +661,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #40] MOV r10, r8 MOV r11, r9 - # Calc new W[2] + /* Calc new W[2] */ LDRD r4, r5, [sp] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -705,7 +705,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #16] - # Round 3 + /* Round 3 */ LDRD r4, r5, [r0, #8] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -785,7 +785,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #32] MOV r10, r8 MOV r11, r9 - # Calc new W[3] + /* Calc new W[3] */ LDRD r4, r5, [sp, #8] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -829,7 +829,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #24] - # Round 4 + /* Round 4 */ LDRD r4, r5, [r0] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -909,7 +909,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #24] MOV r10, r8 MOV r11, r9 - # Calc new W[4] + /* Calc new W[4] */ LDRD r4, r5, [sp, #16] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -953,7 +953,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #32] - # Round 5 + /* Round 5 */ LDRD r4, r5, [r0, #56] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1033,7 +1033,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #16] MOV r10, r8 MOV r11, r9 - # Calc new W[5] + /* Calc new W[5] */ LDRD r4, r5, [sp, #24] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1077,7 +1077,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #40] - # Round 6 + /* Round 6 */ LDRD r4, r5, [r0, #48] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1157,7 +1157,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #8] MOV r10, r8 MOV r11, r9 - # Calc new W[6] + /* Calc new W[6] */ LDRD r4, r5, [sp, #32] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ 
-1201,7 +1201,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #48] - # Round 7 + /* Round 7 */ LDRD r4, r5, [r0, #40] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1281,7 +1281,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0] MOV r10, r8 MOV r11, r9 - # Calc new W[7] + /* Calc new W[7] */ LDRD r4, r5, [sp, #40] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1325,7 +1325,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #56] - # Round 8 + /* Round 8 */ LDRD r4, r5, [r0, #32] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1405,7 +1405,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #56] MOV r10, r8 MOV r11, r9 - # Calc new W[8] + /* Calc new W[8] */ LDRD r4, r5, [sp, #48] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1449,7 +1449,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #64] - # Round 9 + /* Round 9 */ LDRD r4, r5, [r0, #24] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1529,7 +1529,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #48] MOV r10, r8 MOV r11, r9 - # Calc new W[9] + /* Calc new W[9] */ LDRD r4, r5, [sp, #56] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1573,7 +1573,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #72] - # Round 10 + /* Round 10 */ LDRD r4, r5, [r0, #16] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1653,7 +1653,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #40] MOV r10, r8 MOV r11, r9 - # Calc new W[10] + /* Calc new W[10] */ LDRD r4, r5, [sp, #64] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1697,7 +1697,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #80] - # Round 11 + /* Round 11 */ LDRD r4, r5, [r0, #8] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1777,7 +1777,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #32] MOV r10, r8 MOV r11, r9 - # Calc new W[11] + /* Calc new W[11] */ LDRD r4, r5, [sp, #72] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1821,7 +1821,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #88] - # Round 12 + /* Round 12 */ LDRD r4, r5, [r0] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -1901,7 +1901,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #24] MOV r10, r8 MOV r11, r9 - # Calc new W[12] + /* Calc new W[12] */ LDRD r4, r5, [sp, #80] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -1945,7 +1945,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #96] - # Round 13 + /* Round 13 */ LDRD r4, r5, [r0, #56] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2025,7 +2025,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #16] MOV r10, r8 MOV r11, r9 - # Calc new W[13] + /* Calc new W[13] */ LDRD r4, r5, [sp, #88] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -2069,7 +2069,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #104] - # Round 14 + /* Round 14 */ LDRD r4, r5, [r0, #48] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2149,7 +2149,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #8] MOV r10, r8 MOV r11, r9 - # Calc new W[14] + /* Calc new W[14] */ LDRD r4, r5, [sp, #96] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -2193,7 +2193,7 @@ L_SHA512_transform_len_start: ADDS r4, r4, r6 ADC r5, r5, r7 STRD r4, r5, [sp, #112] - # Round 15 + /* Round 15 */ LDRD r4, r5, [r0, #40] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2273,7 +2273,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0] MOV r10, r8 MOV r11, r9 - # Calc new W[15] + /* Calc new W[15] */ LDRD r4, r5, [sp, #104] LSRS r6, r4, #19 LSRS r7, r5, #19 @@ -2319,12 +2319,12 @@ L_SHA512_transform_len_start: STRD r4, 
r5, [sp, #120] ADD r3, r3, #0x80 SUBS r12, r12, #0x1 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE L_SHA512_transform_len_start #else - BNE.N L_SHA512_transform_len_start + BNE.W L_SHA512_transform_len_start #endif - # Round 0 + /* Round 0 */ LDRD r4, r5, [r0, #32] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2404,7 +2404,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #56] MOV r10, r8 MOV r11, r9 - # Round 1 + /* Round 1 */ LDRD r4, r5, [r0, #24] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2484,7 +2484,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #48] MOV r10, r8 MOV r11, r9 - # Round 2 + /* Round 2 */ LDRD r4, r5, [r0, #16] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2564,7 +2564,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #40] MOV r10, r8 MOV r11, r9 - # Round 3 + /* Round 3 */ LDRD r4, r5, [r0, #8] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2644,7 +2644,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #32] MOV r10, r8 MOV r11, r9 - # Round 4 + /* Round 4 */ LDRD r4, r5, [r0] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2724,7 +2724,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #24] MOV r10, r8 MOV r11, r9 - # Round 5 + /* Round 5 */ LDRD r4, r5, [r0, #56] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2804,7 +2804,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #16] MOV r10, r8 MOV r11, r9 - # Round 6 + /* Round 6 */ LDRD r4, r5, [r0, #48] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2884,7 +2884,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #8] MOV r10, r8 MOV r11, r9 - # Round 7 + /* Round 7 */ LDRD r4, r5, [r0, #40] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -2964,7 +2964,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0] MOV r10, r8 MOV r11, r9 - # Round 8 + /* Round 8 */ LDRD r4, r5, [r0, #32] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3044,7 +3044,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #56] MOV r10, r8 MOV r11, r9 - # Round 9 + /* Round 9 */ LDRD r4, r5, [r0, #24] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3124,7 +3124,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #48] MOV r10, r8 MOV r11, r9 - # Round 10 + /* Round 10 */ LDRD r4, r5, [r0, #16] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3204,7 +3204,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #40] MOV r10, r8 MOV r11, r9 - # Round 11 + /* Round 11 */ LDRD r4, r5, [r0, #8] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3284,7 +3284,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #32] MOV r10, r8 MOV r11, r9 - # Round 12 + /* Round 12 */ LDRD r4, r5, [r0] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3364,7 +3364,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #24] MOV r10, r8 MOV r11, r9 - # Round 13 + /* Round 13 */ LDRD r4, r5, [r0, #56] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3444,7 +3444,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #16] MOV r10, r8 MOV r11, r9 - # Round 14 + /* Round 14 */ LDRD r4, r5, [r0, #48] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3524,7 +3524,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0, #8] MOV r10, r8 MOV r11, r9 - # Round 15 + /* Round 15 */ LDRD r4, r5, [r0, #40] LSRS r6, r4, #14 LSRS r7, r5, #14 @@ -3604,7 +3604,7 @@ L_SHA512_transform_len_start: STRD r6, r7, [r0] MOV r10, r8 MOV r11, r9 - # Add in digest from start + /* Add in digest from start */ LDRD r4, r5, [r0] LDRD r6, r7, [r0, #8] LDRD r8, r9, [sp, #128] @@ -3656,15 +3656,15 @@ L_SHA512_transform_len_start: SUBS r2, r2, #0x80 SUB r3, r3, #0x200 ADD r1, r1, #0x80 -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) +#ifdef __GNUC__ BNE 
L_SHA512_transform_len_begin #else - BNE.N L_SHA512_transform_len_begin + BNE.W L_SHA512_transform_len_begin #endif EOR r0, r0, r0 ADD sp, sp, #0xc0 POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - # Cycle Count = 5021 + /* Cycle Count = 5021 */ .size Transform_Sha512_Len,.-Transform_Sha512_Len #endif /* WOLFSSL_ARMASM_NO_NEON */ #endif /* WOLFSSL_SHA512 */ diff --git a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c index 3dc2d1f20..7521b35fa 100644 --- a/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/thumb2-sha512-asm_c.c @@ -39,7 +39,7 @@ #ifdef WOLFSSL_ARMASM_INLINE #ifdef WOLFSSL_ARMASM -#if !defined(__aarch64__) && defined(__arm__) +#if !defined(__aarch64__) && defined(__thumb__) #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm @@ -108,8 +108,8 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) register wc_Sha512* sha512 __asm__ ("r0") = (wc_Sha512*)sha512_p; register const byte* data __asm__ ("r1") = (const byte*)data_p; register word32 len __asm__ ("r2") = (word32)len_p; -#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ register uint64_t* L_SHA512_transform_len_k_c __asm__ ("r3") = (uint64_t*)&L_SHA512_transform_len_k; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ __asm__ __volatile__ ( "SUB sp, sp, #0xc0\n\t" @@ -133,7 +133,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "STRD r10, r11, [sp, #184]\n\t" /* Start of loop processing a block */ "\n" - "L_SHA512_transform_len_begin_%=:\n\t" + "L_SHA512_transform_len_begin:\n\t" /* Load, Reverse and Store W */ "LDR r4, [%[data]]\n\t" "LDR r5, [%[data], #4]\n\t" @@ -239,7 +239,7 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "MOV r12, #0x4\n\t" /* Start of 16 rounds */ "\n" - "L_SHA512_transform_len_start_%=:\n\t" + "L_SHA512_transform_len_start:\n\t" /* Round 0 */ "LDRD r4, r5, [%[sha512], #32]\n\t" "LSRS r6, r4, #14\n\t" @@ -2226,10 +2226,10 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "STRD r4, r5, [sp, #120]\n\t" "ADD r3, r3, #0x80\n\t" "SUBS r12, r12, #0x1\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_SHA512_transform_len_start_%=\n\t" +#ifdef __GNUC__ + "BNE L_SHA512_transform_len_start\n\t" #else - "BNE.N L_SHA512_transform_len_start_%=\n\t" + "BNE.W L_SHA512_transform_len_start\n\t" #endif /* Round 0 */ "LDRD r4, r5, [%[sha512], #32]\n\t" @@ -3563,15 +3563,21 @@ void Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) "SUBS %[len], %[len], #0x80\n\t" "SUB r3, r3, #0x200\n\t" "ADD %[data], %[data], #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_SHA512_transform_len_begin_%=\n\t" +#ifdef __GNUC__ + "BNE L_SHA512_transform_len_begin\n\t" #else - "BNE.N L_SHA512_transform_len_begin_%=\n\t" + "BNE.W L_SHA512_transform_len_begin\n\t" #endif "EOR r0, r0, r0\n\t" "ADD sp, sp, #0xc0\n\t" - : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c) +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len), + [L_SHA512_transform_len_k] "+r" (L_SHA512_transform_len_k_c) : +#else + : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) + : [L_SHA512_transform_len_k] "r" (L_SHA512_transform_len_k) +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12" ); } @@ -3580,7 +3586,7 @@ void 
Transform_Sha512_Len(wc_Sha512* sha512, const byte* data, word32 len) #endif /* WOLFSSL_SHA512 */ #endif /* !__aarch64__ && __thumb__ */ #endif /* WOLFSSL_ARMASM */ -#endif /* !defined(__aarch64__) && defined(__arm__) */ +#endif /* !defined(__aarch64__) && defined(__thumb__) */ #endif /* WOLFSSL_ARMASM */ #endif /* WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c index a1ae275de..54423c2d5 100644 --- a/wolfcrypt/src/sp_arm32.c +++ b/wolfcrypt/src/sp_arm32.c @@ -55,6 +55,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm @@ -5403,10 +5404,13 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x200\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_mul_64_outer_%=: \n\t" "subs r3, r5, #0xfc\n\t" @@ -5451,13 +5455,86 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x100\n\t" - "beq L_sp_2048_mul_64_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_mul_64_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_mul_64_inner_done_%=\n\t" + "blt L_sp_2048_mul_64_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_2048_mul_64_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -5465,14 +5542,46 @@ static void sp_2048_mul_64(sp_digit* r_p, const sp_digit* a_p, const 
sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x1f8\n\t" + "cmp r5, #0x1f4\n\t" "ble L_sp_2048_mul_64_outer_%=\n\t" + "ldr lr, [%[a], #252]\n\t" + "ldr r11, [%[b], #252]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_mul_64_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_mul_64_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -5492,10 +5601,12 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x200\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_sqr_64_outer_%=: \n\t" "subs r3, r5, #0xfc\n\t" @@ -5504,8 +5615,6 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_2048_sqr_64_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_2048_sqr_64_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -5557,9 +5666,11 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_2048_sqr_64_op_done_%=\n\t" - "\n" - "L_sp_2048_sqr_64_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_sqr_64_inner_done_%=\n\t" + "blt L_sp_2048_sqr_64_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -5588,30 +5699,46 @@ static void sp_2048_sqr_64(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_2048_sqr_64_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x100\n\t" - "beq L_sp_2048_sqr_64_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_2048_sqr_64_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_sqr_64_inner_%=\n\t" - "\n" "L_sp_2048_sqr_64_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x1f8\n\t" + "cmp r5, #0x1f4\n\t" "ble L_sp_2048_sqr_64_outer_%=\n\t" + "ldr lr, [%[a], #252]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, 
r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_sqr_64_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_sqr_64_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -5728,10 +5855,13 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_mul_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -5776,13 +5906,86 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_2048_mul_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_mul_32_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_mul_32_inner_done_%=\n\t" + "blt L_sp_2048_mul_32_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_2048_mul_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -5790,14 +5993,46 @@ static void sp_2048_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_2048_mul_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" + "ldr r11, [%[b], 
#124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_2048_mul_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_mul_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -5817,10 +6052,12 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_2048_sqr_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -5829,8 +6066,6 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_2048_sqr_32_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_2048_sqr_32_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -5882,9 +6117,11 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_2048_sqr_32_op_done_%=\n\t" - "\n" - "L_sp_2048_sqr_32_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_2048_sqr_32_inner_done_%=\n\t" + "blt L_sp_2048_sqr_32_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -5913,30 +6150,46 @@ static void sp_2048_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_2048_sqr_32_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_2048_sqr_32_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_2048_sqr_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_2048_sqr_32_inner_%=\n\t" - "\n" "L_sp_2048_sqr_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_2048_sqr_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" 
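
/*
 * Illustrative C sketch (not part of the generated sources) of the
 * WOLFSSL_ARM_ARCH < 4 squaring tail just above: with no UMULL, the top
 * word x = h*2^16 + l is squared as x^2 = (h^2 << 32) + l^2 + (h*l << 17).
 * h*l fits in 32 bits, and the doubled cross term is split across the
 * 64-bit accumulator with LSL #17 (low word) and LSR #15 (high word).
 * All names here are local to the sketch.
 */
#include <stdint.h>

static uint64_t sqr32_sketch(uint32_t x)
{
    uint32_t l = x & 0xffffU;                 /* lsl #16 ; lsr #16      */
    uint32_t h = x >> 16;                     /* lsr #16                */
    uint32_t m = h * l;                       /* cross product, < 2^32  */
    uint64_t r = (uint64_t)(l * l)            /* low half squared       */
               + ((uint64_t)(h * h) << 32);   /* high half squared      */
    r += ((uint64_t)(m >> 15) << 32)          /* lsr #15 into high word */
       + (uint64_t)(m << 17);                 /* lsl #17 into low word  */
    return r;
}
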
"L_sp_2048_sqr_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_2048_sqr_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -28088,10 +28341,13 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x300\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_mul_96_outer_%=: \n\t" "subs r3, r5, #0x17c\n\t" @@ -28136,13 +28392,86 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x180\n\t" - "beq L_sp_3072_mul_96_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_mul_96_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_mul_96_inner_done_%=\n\t" + "blt L_sp_3072_mul_96_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_3072_mul_96_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -28150,14 +28479,46 @@ static void sp_3072_mul_96(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x2f8\n\t" + "cmp r5, #0x2f4\n\t" "ble L_sp_3072_mul_96_outer_%=\n\t" + "ldr lr, [%[a], #380]\n\t" + "ldr r11, [%[b], #380]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul 
r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_mul_96_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_mul_96_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -28177,10 +28538,12 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x300\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_sqr_96_outer_%=: \n\t" "subs r3, r5, #0x17c\n\t" @@ -28189,8 +28552,6 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_3072_sqr_96_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_3072_sqr_96_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -28242,9 +28603,11 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_3072_sqr_96_op_done_%=\n\t" - "\n" - "L_sp_3072_sqr_96_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_sqr_96_inner_done_%=\n\t" + "blt L_sp_3072_sqr_96_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -28273,30 +28636,46 @@ static void sp_3072_sqr_96(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_3072_sqr_96_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x180\n\t" - "beq L_sp_3072_sqr_96_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_3072_sqr_96_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_sqr_96_inner_%=\n\t" - "\n" "L_sp_3072_sqr_96_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x2f8\n\t" + "cmp r5, #0x2f4\n\t" "ble L_sp_3072_sqr_96_outer_%=\n\t" + "ldr lr, [%[a], #380]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_sqr_96_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + 
"stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_sqr_96_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -28413,10 +28792,13 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x180\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_mul_48_outer_%=: \n\t" "subs r3, r5, #0xbc\n\t" @@ -28461,13 +28843,86 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0xc0\n\t" - "beq L_sp_3072_mul_48_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_mul_48_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_mul_48_inner_done_%=\n\t" + "blt L_sp_3072_mul_48_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_3072_mul_48_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -28475,14 +28930,46 @@ static void sp_3072_mul_48(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x178\n\t" + "cmp r5, #0x174\n\t" "ble L_sp_3072_mul_48_outer_%=\n\t" + "ldr lr, [%[a], #188]\n\t" + "ldr r11, [%[b], #188]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" 
+ "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_mul_48_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_mul_48_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -28502,10 +28989,12 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x180\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_3072_sqr_48_outer_%=: \n\t" "subs r3, r5, #0xbc\n\t" @@ -28514,8 +29003,6 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_3072_sqr_48_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_3072_sqr_48_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -28567,9 +29054,11 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_3072_sqr_48_op_done_%=\n\t" - "\n" - "L_sp_3072_sqr_48_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_3072_sqr_48_inner_done_%=\n\t" + "blt L_sp_3072_sqr_48_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -28598,30 +29087,46 @@ static void sp_3072_sqr_48(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_3072_sqr_48_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0xc0\n\t" - "beq L_sp_3072_sqr_48_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_3072_sqr_48_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_3072_sqr_48_inner_%=\n\t" - "\n" "L_sp_3072_sqr_48_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x178\n\t" + "cmp r5, #0x174\n\t" "ble L_sp_3072_sqr_48_outer_%=\n\t" + "ldr lr, [%[a], #188]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_3072_sqr_48_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_3072_sqr_48_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -46058,10 +46563,13 @@ static void 
sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* __asm__ __volatile__ ( "sub sp, sp, #0x400\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_4096_mul_128_outer_%=: \n\t" "subs r3, r5, #0x1fc\n\t" @@ -46106,13 +46614,86 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x200\n\t" - "beq L_sp_4096_mul_128_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_4096_mul_128_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_4096_mul_128_inner_done_%=\n\t" + "blt L_sp_4096_mul_128_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_4096_mul_128_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -46120,14 +46701,46 @@ static void sp_4096_mul_128(sp_digit* r_p, const sp_digit* a_p, const sp_digit* "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x3f8\n\t" + "cmp r5, #0x3f4\n\t" "ble L_sp_4096_mul_128_outer_%=\n\t" + "ldr lr, [%[a], #508]\n\t" + "ldr r11, [%[b], #508]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" 
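
/*
 * Illustrative C sketch (not part of the generated sources) of the
 * product-scanning ("comba") shape that sp_4096_mul_128 and the other
 * rewritten mul/sqr routines now share.  The first partial product
 * a[0]*b[0] seeds the accumulator before the outer loop (the new
 * UMULL/STR prologue and "mov r5, #4"), the last one a[n-1]*b[n-1] is
 * handled after it (so the outer bound drops one word, e.g.
 * 0x3f8 -> 0x3f4), and the top two result words are stored before the
 * widened eight-word copy loop.  Names are local to the sketch; n >= 2.
 */
#include <stdint.h>

static void mul_sketch(uint32_t* r, const uint32_t* a, const uint32_t* b,
                       int n)
{
    uint64_t acc = (uint64_t)a[0] * b[0];     /* peeled first product   */
    uint32_t over = 0;                        /* third accumulator word */
    int k;
    r[0] = (uint32_t)acc;
    acc >>= 32;
    for (k = 1; k <= 2 * n - 3; k++) {        /* middle output columns  */
        int i = (k < n) ? 0 : k - (n - 1);    /* subs/movcc clamp       */
        int j = k - i;
        for (; i < j; i++, j--) {             /* a[i]*b[j] + a[j]*b[i]  */
            uint64_t p = (uint64_t)a[i] * b[j];
            acc += p; over += (acc < p);
            p = (uint64_t)a[j] * b[i];
            acc += p; over += (acc < p);
        }
        if (i == j) {                         /* single middle product  */
            uint64_t p = (uint64_t)a[i] * b[i];
            acc += p; over += (acc < p);
        }
        r[k] = (uint32_t)acc;                 /* str r6, [sp, r5]       */
        acc = (acc >> 32) | ((uint64_t)over << 32);
        over = 0;
    }
    acc += (uint64_t)a[n - 1] * b[n - 1];     /* peeled last product    */
    r[2 * n - 2] = (uint32_t)acc;
    r[2 * n - 1] = (uint32_t)(acc >> 32);
}
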
+ "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_4096_mul_128_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_4096_mul_128_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -46147,10 +46760,12 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x400\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_4096_sqr_128_outer_%=: \n\t" "subs r3, r5, #0x1fc\n\t" @@ -46159,8 +46774,6 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_4096_sqr_128_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_4096_sqr_128_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -46212,9 +46825,11 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_4096_sqr_128_op_done_%=\n\t" - "\n" - "L_sp_4096_sqr_128_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_4096_sqr_128_inner_done_%=\n\t" + "blt L_sp_4096_sqr_128_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -46243,30 +46858,46 @@ static void sp_4096_sqr_128(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_4096_sqr_128_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x200\n\t" - "beq L_sp_4096_sqr_128_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_4096_sqr_128_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_4096_sqr_128_inner_%=\n\t" - "\n" "L_sp_4096_sqr_128_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x3f8\n\t" + "cmp r5, #0x3f4\n\t" "ble L_sp_4096_sqr_128_outer_%=\n\t" + "ldr lr, [%[a], #508]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_4096_sqr_128_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_4096_sqr_128_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -60831,10 +61462,13 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr 
lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_256_mul_8_outer_%=: \n\t" "subs r3, r5, #28\n\t" @@ -60879,13 +61513,86 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #32\n\t" - "beq L_sp_256_mul_8_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_256_mul_8_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_256_mul_8_inner_done_%=\n\t" + "blt L_sp_256_mul_8_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_256_mul_8_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -60893,14 +61600,46 @@ static void sp_256_mul_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_p "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #56\n\t" + "cmp r5, #52\n\t" "ble L_sp_256_mul_8_outer_%=\n\t" + "ldr lr, [%[a], #28]\n\t" + "ldr r11, [%[b], #28]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + 
"add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_256_mul_8_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_256_mul_8_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -63403,10 +64142,12 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x40\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_256_sqr_8_outer_%=: \n\t" "subs r3, r5, #28\n\t" @@ -63415,8 +64156,6 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_256_sqr_8_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_256_sqr_8_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -63468,9 +64207,11 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_256_sqr_8_op_done_%=\n\t" - "\n" - "L_sp_256_sqr_8_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_256_sqr_8_inner_done_%=\n\t" + "blt L_sp_256_sqr_8_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -63499,30 +64240,46 @@ static void sp_256_sqr_8(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_256_sqr_8_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #32\n\t" - "beq L_sp_256_sqr_8_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_256_sqr_8_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_256_sqr_8_inner_%=\n\t" - "\n" "L_sp_256_sqr_8_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #56\n\t" + "cmp r5, #52\n\t" "ble L_sp_256_sqr_8_outer_%=\n\t" + "ldr lr, [%[a], #28]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_256_sqr_8_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_256_sqr_8_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -79028,10 +79785,13 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ __asm__ __volatile__ ( "sub sp, sp, #0x60\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_384_mul_12_outer_%=: \n\t" "subs r3, r5, #44\n\t" @@ -79076,13 +79836,86 @@ static void sp_384_mul_12(sp_digit* 
r_p, const sp_digit* a_p, const sp_digit* b_ "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #48\n\t" - "beq L_sp_384_mul_12_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_384_mul_12_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_384_mul_12_inner_done_%=\n\t" + "blt L_sp_384_mul_12_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_384_mul_12_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -79090,14 +79923,46 @@ static void sp_384_mul_12(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x58\n\t" + "cmp r5, #0x54\n\t" "ble L_sp_384_mul_12_outer_%=\n\t" + "ldr lr, [%[a], #44]\n\t" + "ldr r11, [%[b], #44]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_384_mul_12_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, 
r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_384_mul_12_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -84616,10 +85481,12 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x60\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_384_sqr_12_outer_%=: \n\t" "subs r3, r5, #44\n\t" @@ -84628,8 +85495,6 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_384_sqr_12_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_384_sqr_12_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -84681,9 +85546,11 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_384_sqr_12_op_done_%=\n\t" - "\n" - "L_sp_384_sqr_12_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_384_sqr_12_inner_done_%=\n\t" + "blt L_sp_384_sqr_12_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -84712,30 +85579,46 @@ static void sp_384_sqr_12(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_384_sqr_12_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #48\n\t" - "beq L_sp_384_sqr_12_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_384_sqr_12_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_384_sqr_12_inner_%=\n\t" - "\n" "L_sp_384_sqr_12_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x58\n\t" + "cmp r5, #0x54\n\t" "ble L_sp_384_sqr_12_outer_%=\n\t" + "ldr lr, [%[a], #44]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_384_sqr_12_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_384_sqr_12_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -97020,10 +97903,13 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ __asm__ __volatile__ ( "sub sp, sp, #0x88\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_521_mul_17_outer_%=: \n\t" "subs r3, r5, #0x40\n\t" @@ -97068,13 +97954,86 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) 
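
/*
 * Illustrative C sketch (not part of the generated sources) of the
 * loop-exit shape the rewritten inner loops share, including the
 * sp_521_mul_17 loop here.  Instead of comparing the climbing index
 * against a fixed bound and then against the column offset, the new
 * code walks i up and j down and compares them against each other: one
 * CMP feeds both exits (BGT done, BLT next pair) and the i == j case
 * falls through to one straight-line middle product.  Assumes
 * 1 <= k <= 2*n - 3, so the peeled first and last columns never enter.
 */
#include <stdint.h>

static void column_sketch(uint64_t* lo, uint32_t* hi, const uint32_t* a,
                          const uint32_t* b, int k, int n)
{
    int i = (k < n) ? 0 : k - (n - 1);
    int j = k - i;
    for (;;) {
        uint64_t p = (uint64_t)a[i] * b[j];   /* first UMULL of pair    */
        *lo += p; *hi += (*lo < p);
        p = (uint64_t)a[j] * b[i];            /* second UMULL of pair   */
        *lo += p; *hi += (*lo < p);
        i++; j--;
        if (i > j) return;                    /* BGT: column finished   */
        if (i < j) continue;                  /* BLT: more pairs        */
        p = (uint64_t)a[i] * b[i];            /* i == j: middle product */
        *lo += p; *hi += (*lo < p);
        return;
    }
}
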
+ "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x44\n\t" - "beq L_sp_521_mul_17_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_521_mul_17_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_521_mul_17_inner_done_%=\n\t" + "blt L_sp_521_mul_17_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_521_mul_17_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -97082,17 +98041,49 @@ static void sp_521_mul_17(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b_ "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x80\n\t" + "cmp r5, #0x7c\n\t" "ble L_sp_521_mul_17_outer_%=\n\t" + "ldr lr, [%[a], #64]\n\t" + "ldr r11, [%[b], #64]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "ldm sp!, {r6, r7}\n\t" "stm %[r]!, {r6, r7}\n\t" "sub r5, r5, #8\n\t" "\n" "L_sp_521_mul_17_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_521_mul_17_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -108130,10 +109121,12 @@ 
static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x88\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_521_sqr_17_outer_%=: \n\t" "subs r3, r5, #0x40\n\t" @@ -108142,8 +109135,6 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_521_sqr_17_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_521_sqr_17_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -108195,9 +109186,11 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_521_sqr_17_op_done_%=\n\t" - "\n" - "L_sp_521_sqr_17_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_521_sqr_17_inner_done_%=\n\t" + "blt L_sp_521_sqr_17_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -108226,33 +109219,49 @@ static void sp_521_sqr_17(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_521_sqr_17_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x44\n\t" - "beq L_sp_521_sqr_17_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_521_sqr_17_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_521_sqr_17_inner_%=\n\t" - "\n" "L_sp_521_sqr_17_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0x80\n\t" + "cmp r5, #0x7c\n\t" "ble L_sp_521_sqr_17_outer_%=\n\t" + "ldr lr, [%[a], #64]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "ldm sp!, {r6, r7}\n\t" "stm %[r]!, {r6, r7}\n\t" "sub r5, r5, #8\n\t" "\n" "L_sp_521_sqr_17_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_521_sqr_17_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : @@ -141063,10 +142072,13 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r5, #0\n\t" - "mov r6, #0\n\t" + "ldr lr, [%[a]]\n\t" + "ldr r11, [%[b]]\n\t" + "umull r8, r6, lr, r11\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_1024_mul_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -141111,13 +142123,86 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "adds r6, r6, r9\n\t" "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" +#endif + "ldr lr, [%[a], r4]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, 
#16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" #endif "add r3, r3, #4\n\t" "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_1024_mul_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_1024_mul_32_inner_%=\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_1024_mul_32_inner_done_%=\n\t" + "blt L_sp_1024_mul_32_inner_%=\n\t" + "ldr lr, [%[a], r3]\n\t" + "ldr r11, [%[b], r3]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adcs r7, r7, #0\n\t" + "adc r8, r8, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#else + "umull r9, r10, lr, r11\n\t" + "adds r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "adc r8, r8, #0\n\t" +#endif "\n" "L_sp_1024_mul_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" @@ -141125,14 +142210,46 @@ static void sp_1024_mul_32(sp_digit* r_p, const sp_digit* a_p, const sp_digit* b "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_1024_mul_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" + "ldr r11, [%[b], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsl r10, r11, #16\n\t" + "lsr r9, r9, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r10, r9, r10\n\t" + "adds r6, r6, r10\n\t" + "adc r7, r7, #0\n\t" + "lsr r10, r11, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r9, lr, #16\n\t" + "lsr r10, r11, #16\n\t" + "mul r10, r9, r10\n\t" + "add r7, r7, r10\n\t" + "lsl r10, r11, #16\n\t" + "lsr r10, r10, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #16\n\t" + "lsl r9, r9, #16\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umlal r6, r7, lr, r11\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_1024_mul_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_1024_mul_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -141152,10 +142269,12 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) __asm__ __volatile__ ( "sub sp, sp, #0x100\n\t" - "mov r6, 
#0\n\t" + "ldr lr, [%[a]]\n\t" + "umull r8, r6, lr, lr\n\t" + "str r8, [sp]\n\t" "mov r7, #0\n\t" "mov r8, #0\n\t" - "mov r5, #0\n\t" + "mov r5, #4\n\t" "\n" "L_sp_1024_sqr_32_outer_%=: \n\t" "subs r3, r5, #0x7c\n\t" @@ -141164,8 +142283,6 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "sub r4, r5, r3\n\t" "\n" "L_sp_1024_sqr_32_inner_%=: \n\t" - "cmp r4, r3\n\t" - "beq L_sp_1024_sqr_32_op_sqr_%=\n\t" "ldr lr, [%[a], r3]\n\t" "ldr r11, [%[a], r4]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) @@ -141217,9 +142334,11 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adcs r7, r7, r10\n\t" "adc r8, r8, #0\n\t" #endif - "bal L_sp_1024_sqr_32_op_done_%=\n\t" - "\n" - "L_sp_1024_sqr_32_op_sqr_%=: \n\t" + "add r3, r3, #4\n\t" + "sub r4, r4, #4\n\t" + "cmp r3, r4\n\t" + "bgt L_sp_1024_sqr_32_inner_done_%=\n\t" + "blt L_sp_1024_sqr_32_inner_%=\n\t" "ldr lr, [%[a], r3]\n\t" #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) "lsl r9, lr, #16\n\t" @@ -141248,30 +142367,46 @@ static void sp_1024_sqr_32(sp_digit* r_p, const sp_digit* a_p) "adc r8, r8, #0\n\t" #endif "\n" - "L_sp_1024_sqr_32_op_done_%=: \n\t" - "add r3, r3, #4\n\t" - "sub r4, r4, #4\n\t" - "cmp r3, #0x80\n\t" - "beq L_sp_1024_sqr_32_inner_done_%=\n\t" - "cmp r3, r4\n\t" - "bgt L_sp_1024_sqr_32_inner_done_%=\n\t" - "cmp r3, r5\n\t" - "ble L_sp_1024_sqr_32_inner_%=\n\t" - "\n" "L_sp_1024_sqr_32_inner_done_%=: \n\t" "str r6, [sp, r5]\n\t" "mov r6, r7\n\t" "mov r7, r8\n\t" "mov r8, #0\n\t" "add r5, r5, #4\n\t" - "cmp r5, #0xf8\n\t" + "cmp r5, #0xf4\n\t" "ble L_sp_1024_sqr_32_outer_%=\n\t" + "ldr lr, [%[a], #124]\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 4) + "lsl r9, lr, #16\n\t" + "lsr r10, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mov r11, r9\n\t" + "mul r9, r11, r9\n\t" + "mov r11, r10\n\t" + "mul r10, r11, r10\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" + "lsr r10, lr, #16\n\t" + "lsl r9, lr, #16\n\t" + "lsr r9, r9, #16\n\t" + "mul r9, r10, r9\n\t" + "lsr r10, r9, #15\n\t" + "lsl r9, r9, #17\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#else + "umull r9, r10, lr, lr\n\t" + "adds r6, r6, r9\n\t" + "adc r7, r7, r10\n\t" +#endif "str r6, [sp, r5]\n\t" + "add r5, r5, #4\n\t" + "str r7, [sp, r5]\n\t" "\n" "L_sp_1024_sqr_32_store_%=: \n\t" - "ldm sp!, {r6, r7, r8, r9}\n\t" - "stm %[r]!, {r6, r7, r8, r9}\n\t" - "subs r5, r5, #16\n\t" + "ldm sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "stm %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "subs r5, r5, #32\n\t" "bgt L_sp_1024_sqr_32_store_%=\n\t" : [r] "+r" (r), [a] "+r" (a) : diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index 2ba0058e9..0533f9396 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -55,6 +55,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c index 1873ef373..f7aeb1055 100644 --- a/wolfcrypt/src/sp_armthumb.c +++ b/wolfcrypt/src/sp_armthumb.c @@ -55,6 +55,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index 37a7ea28d..8baa651ea 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -59,6 +59,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define 
__asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index d1888cd47..f6aef0ebb 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -59,6 +59,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index 48263c751..af612beba 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -55,6 +55,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm @@ -2222,7 +2223,7 @@ static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x100\n\t" "\n" - "L_sp_2048_add_64_word_%=:\n\t" + "L_sp_2048_add_64_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -2235,9 +2236,9 @@ static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_2048_add_64_word_%=\n\t" + "BNE L_sp_2048_add_64_word\n\t" #else - "BNE.N L_sp_2048_add_64_word_%=\n\t" + "BNE.N L_sp_2048_add_64_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -2269,7 +2270,7 @@ static sp_digit sp_2048_sub_in_place_64(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x100\n\t" "\n" - "L_sp_2048_sub_in_pkace_64_word_%=:\n\t" + "L_sp_2048_sub_in_pkace_64_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -2281,9 +2282,9 @@ static sp_digit sp_2048_sub_in_place_64(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_2048_sub_in_pkace_64_word_%=\n\t" + "BNE L_sp_2048_sub_in_pkace_64_word\n\t" #else - "BNE.N L_sp_2048_sub_in_pkace_64_word_%=\n\t" + "BNE.N L_sp_2048_sub_in_pkace_64_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -2315,61 +2316,80 @@ static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x200\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_2048_mul_64_outer_%=:\n\t" + "L_sp_2048_mul_64_outer:\n\t" "SUBS r3, r5, #0xfc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_2048_mul_64_inner_%=:\n\t" + "L_sp_2048_mul_64_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x100\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_mul_64_inner_done_%=\n\t" + "BGT 
L_sp_2048_mul_64_inner_done\n\t" #else - "BEQ.N L_sp_2048_mul_64_inner_done_%=\n\t" + "BGT.N L_sp_2048_mul_64_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_mul_64_inner_%=\n\t" + "BLT L_sp_2048_mul_64_inner\n\t" #else - "BLE.N L_sp_2048_mul_64_inner_%=\n\t" + "BLT.N L_sp_2048_mul_64_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_2048_mul_64_inner_done_%=:\n\t" + "L_sp_2048_mul_64_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x1f8\n\t" + "CMP r5, #0x1f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_mul_64_outer_%=\n\t" + "BLE L_sp_2048_mul_64_outer\n\t" #else - "BLE.N L_sp_2048_mul_64_outer_%=\n\t" + "BLE.N L_sp_2048_mul_64_outer\n\t" #endif + "LDR lr, [%[a], #252]\n\t" + "LDR r11, [%[b], #252]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_2048_mul_64_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_2048_mul_64_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_mul_64_store_%=\n\t" + "BGT L_sp_2048_mul_64_store\n\t" #else - "BGT.N L_sp_2048_mul_64_store_%=\n\t" + "BGT.N L_sp_2048_mul_64_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -2395,24 +2415,20 @@ static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x200\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_2048_sqr_64_outer_%=:\n\t" + "L_sp_2048_sqr_64_outer:\n\t" "SUBS r3, r5, #0xfc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_2048_sqr_64_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_64_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_2048_sqr_64_op_sqr_%=\n\t" -#endif + "L_sp_2048_sqr_64_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -2422,59 +2438,51 @@ static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_2048_sqr_64_op_done_%=\n\t" - "\n" - "L_sp_2048_sqr_64_op_sqr_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_2048_sqr_64_inner_done\n\t" +#else + "BGT.N L_sp_2048_sqr_64_inner_done\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_2048_sqr_64_inner\n\t" +#else + "BLT.N L_sp_2048_sqr_64_inner\n\t" +#endif "LDR lr, [%[a], r3]\n\t" "UMULL r9, r10, lr, lr\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_2048_sqr_64_op_done_%=:\n\t" - "ADD r3, r3, #0x4\n\t" - "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x100\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ 
L_sp_2048_sqr_64_inner_done_%=\n\t" -#else - "BEQ.N L_sp_2048_sqr_64_inner_done_%=\n\t" -#endif - "CMP r3, r4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_sqr_64_inner_done_%=\n\t" -#else - "BGT.N L_sp_2048_sqr_64_inner_done_%=\n\t" -#endif - "CMP r3, r5\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_sqr_64_inner_%=\n\t" -#else - "BLE.N L_sp_2048_sqr_64_inner_%=\n\t" -#endif - "\n" - "L_sp_2048_sqr_64_inner_done_%=:\n\t" + "L_sp_2048_sqr_64_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x1f8\n\t" + "CMP r5, #0x1f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_sqr_64_outer_%=\n\t" + "BLE L_sp_2048_sqr_64_outer\n\t" #else - "BLE.N L_sp_2048_sqr_64_outer_%=\n\t" + "BLE.N L_sp_2048_sqr_64_outer\n\t" #endif + "LDR lr, [%[a], #252]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_2048_sqr_64_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_2048_sqr_64_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_sqr_64_store_%=\n\t" + "BGT L_sp_2048_sqr_64_store\n\t" #else - "BGT.N L_sp_2048_sqr_64_store_%=\n\t" + "BGT.N L_sp_2048_sqr_64_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -2524,7 +2532,7 @@ static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x80\n\t" "\n" - "L_sp_2048_add_32_word_%=:\n\t" + "L_sp_2048_add_32_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -2537,9 +2545,9 @@ static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_2048_add_32_word_%=\n\t" + "BNE L_sp_2048_add_32_word\n\t" #else - "BNE.N L_sp_2048_add_32_word_%=\n\t" + "BNE.N L_sp_2048_add_32_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -2571,7 +2579,7 @@ static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x80\n\t" "\n" - "L_sp_2048_sub_in_pkace_32_word_%=:\n\t" + "L_sp_2048_sub_in_pkace_32_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -2583,9 +2591,9 @@ static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_2048_sub_in_pkace_32_word_%=\n\t" + "BNE L_sp_2048_sub_in_pkace_32_word\n\t" #else - "BNE.N L_sp_2048_sub_in_pkace_32_word_%=\n\t" + "BNE.N L_sp_2048_sub_in_pkace_32_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -2617,61 +2625,80 @@ static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" 
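+ /* Word 0 of the result (the low half of a[0]*b[0]) is already stored */
+ /* above; r6:r7:r8 carry the running column sum and r5 holds the byte */
+ /* offset of the next output word, so the outer loop starts at column 1. */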
"\n" - "L_sp_2048_mul_32_outer_%=:\n\t" + "L_sp_2048_mul_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_2048_mul_32_inner_%=:\n\t" + "L_sp_2048_mul_32_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_mul_32_inner_done_%=\n\t" + "BGT L_sp_2048_mul_32_inner_done\n\t" #else - "BEQ.N L_sp_2048_mul_32_inner_done_%=\n\t" + "BGT.N L_sp_2048_mul_32_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_mul_32_inner_%=\n\t" + "BLT L_sp_2048_mul_32_inner\n\t" #else - "BLE.N L_sp_2048_mul_32_inner_%=\n\t" + "BLT.N L_sp_2048_mul_32_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_2048_mul_32_inner_done_%=:\n\t" + "L_sp_2048_mul_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_mul_32_outer_%=\n\t" + "BLE L_sp_2048_mul_32_outer\n\t" #else - "BLE.N L_sp_2048_mul_32_outer_%=\n\t" + "BLE.N L_sp_2048_mul_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "LDR r11, [%[b], #124]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_2048_mul_32_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_2048_mul_32_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_mul_32_store_%=\n\t" + "BGT L_sp_2048_mul_32_store\n\t" #else - "BGT.N L_sp_2048_mul_32_store_%=\n\t" + "BGT.N L_sp_2048_mul_32_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -2697,24 +2724,20 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_2048_sqr_32_outer_%=:\n\t" + "L_sp_2048_sqr_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_2048_sqr_32_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_32_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_2048_sqr_32_op_sqr_%=\n\t" -#endif + "L_sp_2048_sqr_32_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -2724,59 +2747,51 @@ static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_2048_sqr_32_op_done_%=\n\t" - "\n" 
- "L_sp_2048_sqr_32_op_sqr_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_2048_sqr_32_inner_done\n\t" +#else + "BGT.N L_sp_2048_sqr_32_inner_done\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_2048_sqr_32_inner\n\t" +#else + "BLT.N L_sp_2048_sqr_32_inner\n\t" +#endif "LDR lr, [%[a], r3]\n\t" "UMULL r9, r10, lr, lr\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_2048_sqr_32_op_done_%=:\n\t" - "ADD r3, r3, #0x4\n\t" - "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_2048_sqr_32_inner_done_%=\n\t" -#else - "BEQ.N L_sp_2048_sqr_32_inner_done_%=\n\t" -#endif - "CMP r3, r4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_sqr_32_inner_done_%=\n\t" -#else - "BGT.N L_sp_2048_sqr_32_inner_done_%=\n\t" -#endif - "CMP r3, r5\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_sqr_32_inner_%=\n\t" -#else - "BLE.N L_sp_2048_sqr_32_inner_%=\n\t" -#endif - "\n" - "L_sp_2048_sqr_32_inner_done_%=:\n\t" + "L_sp_2048_sqr_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_2048_sqr_32_outer_%=\n\t" + "BLE L_sp_2048_sqr_32_outer\n\t" #else - "BLE.N L_sp_2048_sqr_32_outer_%=\n\t" + "BLE.N L_sp_2048_sqr_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_2048_sqr_32_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_2048_sqr_32_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_2048_sqr_32_store_%=\n\t" + "BGT L_sp_2048_sqr_32_store\n\t" #else - "BGT.N L_sp_2048_sqr_32_store_%=\n\t" + "BGT.N L_sp_2048_sqr_32_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -2835,7 +2850,7 @@ static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_2048_mul_d_64_word_%=:\n\t" + "L_sp_2048_mul_d_64_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -2849,9 +2864,9 @@ static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mul_d_64_word_%=\n\t" + "BLT L_sp_2048_mul_d_64_word\n\t" #else - "BLT.N L_sp_2048_mul_d_64_word_%=\n\t" + "BLT.N L_sp_2048_mul_d_64_word\n\t" #endif "STR r3, [%[r], #256]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -3249,7 +3264,7 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_dig "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_2048_cond_sub_32_words_%=:\n\t" + "L_sp_2048_cond_sub_32_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -3260,9 +3275,9 @@ static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_dig "ADD 
r5, r5, #0x4\n\t" "CMP r5, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_cond_sub_32_words_%=\n\t" + "BLT L_sp_2048_cond_sub_32_words\n\t" #else - "BLT.N L_sp_2048_cond_sub_32_words_%=\n\t" + "BLT.N L_sp_2048_cond_sub_32_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -3445,7 +3460,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_2048_mont_reduce_32_word_%=:\n\t" + "L_sp_2048_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -3708,9 +3723,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x80\n\t" #ifdef __GNUC__ - "BLT L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_word\n\t" #else - "BLT.W L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT.W L_sp_2048_mont_reduce_32_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -3749,7 +3764,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_32_word_%=:\n\t" + "L_sp_2048_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -3757,7 +3772,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_32_mul_%=:\n\t" + "L_sp_2048_mont_reduce_32_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -3800,9 +3815,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_32_mul_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_mul\n\t" #else - "BLT.N L_sp_2048_mont_reduce_32_mul_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_32_mul\n\t" #endif "LDR r10, [%[a], #128]\n\t" "ADDS r4, r4, r3\n\t" @@ -3816,9 +3831,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_word\n\t" #else - "BLT.N L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_32_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -3860,7 +3875,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_2048_mont_reduce_32_word_%=:\n\t" + "L_sp_2048_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -4028,9 +4043,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x80\n\t" #ifdef __GNUC__ - "BLT L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_word\n\t" #else - "BLT.W L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT.W L_sp_2048_mont_reduce_32_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -4072,7 +4087,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_32_word_%=:\n\t" + "L_sp_2048_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" 
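 /* r10 = a[i]; mu (r8) is chosen so that a[i] + m[0]*mu == 0 (mod 2^32), */
 /* letting the inner loop below cancel one word of a[] per iteration. */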
"MUL r8, %[mp], r10\n\t" @@ -4080,7 +4095,7 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_32_mul_%=:\n\t" + "L_sp_2048_mont_reduce_32_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -4111,9 +4126,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_32_mul_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_mul\n\t" #else - "BLT.N L_sp_2048_mont_reduce_32_mul_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_32_mul\n\t" #endif "LDR r10, [%[a], #128]\n\t" "ADDS r4, r4, r3\n\t" @@ -4127,9 +4142,9 @@ static void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_32_word\n\t" #else - "BLT.N L_sp_2048_mont_reduce_32_word_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_32_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -4200,7 +4215,7 @@ static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_2048_mul_d_32_word_%=:\n\t" + "L_sp_2048_mul_d_32_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -4214,9 +4229,9 @@ static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mul_d_32_word_%=\n\t" + "BLT L_sp_2048_mul_d_32_word\n\t" #else - "BLT.N L_sp_2048_mul_d_32_word_%=\n\t" + "BLT.N L_sp_2048_mul_d_32_word\n\t" #endif "STR r3, [%[r], #128]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -4423,9 +4438,9 @@ static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -4488,9 +4503,9 @@ static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -4514,7 +4529,7 @@ static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_2048_word_32_bit_%=:\n\t" + "L_div_2048_word_32_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -4524,7 +4539,7 @@ static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_2048_word_32_bit_%=\n\t" + "bpl L_div_2048_word_32_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -4576,7 +4591,7 @@ static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x7c\n\t" "\n" - "L_sp_2048_cmp_32_words_%=:\n\t" + "L_sp_2048_cmp_32_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -4589,7 +4604,7 @@ static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_2048_cmp_32_words_%=\n\t" + "bcs L_sp_2048_cmp_32_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #124]\n\t" @@ -5377,7 +5392,7 @@ static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_dig "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_2048_cond_sub_64_words_%=:\n\t" + "L_sp_2048_cond_sub_64_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -5388,9 +5403,9 @@ static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_dig "ADD r5, r5, #0x4\n\t" "CMP r5, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_cond_sub_64_words_%=\n\t" + "BLT L_sp_2048_cond_sub_64_words\n\t" #else - "BLT.N L_sp_2048_cond_sub_64_words_%=\n\t" + "BLT.N L_sp_2048_cond_sub_64_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -5685,7 +5700,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_2048_mont_reduce_64_word_%=:\n\t" + "L_sp_2048_mont_reduce_64_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -6204,9 +6219,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x100\n\t" #ifdef __GNUC__ - "BLT L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_word\n\t" #else - "BLT.W L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT.W L_sp_2048_mont_reduce_64_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -6245,7 +6260,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_64_word_%=:\n\t" + "L_sp_2048_mont_reduce_64_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -6253,7 +6268,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_64_mul_%=:\n\t" + 
"L_sp_2048_mont_reduce_64_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -6296,9 +6311,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_64_mul_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_mul\n\t" #else - "BLT.N L_sp_2048_mont_reduce_64_mul_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_64_mul\n\t" #endif "LDR r10, [%[a], #256]\n\t" "ADDS r4, r4, r3\n\t" @@ -6312,9 +6327,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_word\n\t" #else - "BLT.N L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_64_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -6356,7 +6371,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_2048_mont_reduce_64_word_%=:\n\t" + "L_sp_2048_mont_reduce_64_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -6684,9 +6699,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x100\n\t" #ifdef __GNUC__ - "BLT L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_word\n\t" #else - "BLT.W L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT.W L_sp_2048_mont_reduce_64_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -6728,7 +6743,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_64_word_%=:\n\t" + "L_sp_2048_mont_reduce_64_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -6736,7 +6751,7 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_2048_mont_reduce_64_mul_%=:\n\t" + "L_sp_2048_mont_reduce_64_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -6767,9 +6782,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_64_mul_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_mul\n\t" #else - "BLT.N L_sp_2048_mont_reduce_64_mul_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_64_mul\n\t" #endif "LDR r10, [%[a], #256]\n\t" "ADDS r4, r4, r3\n\t" @@ -6783,9 +6798,9 @@ static void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT L_sp_2048_mont_reduce_64_word\n\t" #else - "BLT.N L_sp_2048_mont_reduce_64_word_%=\n\t" + "BLT.N L_sp_2048_mont_reduce_64_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -6851,7 +6866,7 @@ static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x100\n\t" "\n" - "L_sp_2048_sub_64_word_%=:\n\t" + "L_sp_2048_sub_64_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, 
r8, r9, r10}\n\t" @@ -6863,9 +6878,9 @@ static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_2048_sub_64_word_%=\n\t" + "BNE L_sp_2048_sub_64_word\n\t" #else - "BNE.N L_sp_2048_sub_64_word_%=\n\t" + "BNE.N L_sp_2048_sub_64_word\n\t" #endif "MOV %[r], r11\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -7027,9 +7042,9 @@ static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -7092,9 +7107,9 @@ static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -7118,7 +7133,7 @@ static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_2048_word_64_bit_%=:\n\t" + "L_div_2048_word_64_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -7128,7 +7143,7 @@ static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_2048_word_64_bit_%=\n\t" + "bpl L_div_2048_word_64_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -7283,7 +7298,7 @@ static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0xfc\n\t" "\n" - "L_sp_2048_cmp_64_words_%=:\n\t" + "L_sp_2048_cmp_64_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -7296,7 +7311,7 @@ static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_2048_cmp_64_words_%=\n\t" + "bcs L_sp_2048_cmp_64_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #252]\n\t" @@ -8559,7 +8574,7 @@ static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "MOV r8, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_2048_cond_add_32_words_%=:\n\t" + "L_sp_2048_cond_add_32_words:\n\t" "ADDS r5, r5, #0xffffffff\n\t" "LDR r6, [%[a], r4]\n\t" "LDR r7, [%[b], r4]\n\t" @@ -8570,9 +8585,9 @@ static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "ADD r4, r4, #0x4\n\t" "CMP r4, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_2048_cond_add_32_words_%=\n\t" + "BLT L_sp_2048_cond_add_32_words\n\t" #else - "BLT.N L_sp_2048_cond_add_32_words_%=\n\t" + "BLT.N L_sp_2048_cond_add_32_words\n\t" #endif "MOV 
%[r], r5\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -12945,7 +12960,7 @@ static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x180\n\t" "\n" - "L_sp_3072_add_96_word_%=:\n\t" + "L_sp_3072_add_96_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -12958,9 +12973,9 @@ static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, const sp_digit* b "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_3072_add_96_word_%=\n\t" + "BNE L_sp_3072_add_96_word\n\t" #else - "BNE.N L_sp_3072_add_96_word_%=\n\t" + "BNE.N L_sp_3072_add_96_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -12992,7 +13007,7 @@ static sp_digit sp_3072_sub_in_place_96(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x180\n\t" "\n" - "L_sp_3072_sub_in_pkace_96_word_%=:\n\t" + "L_sp_3072_sub_in_pkace_96_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -13004,9 +13019,9 @@ static sp_digit sp_3072_sub_in_place_96(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_3072_sub_in_pkace_96_word_%=\n\t" + "BNE L_sp_3072_sub_in_pkace_96_word\n\t" #else - "BNE.N L_sp_3072_sub_in_pkace_96_word_%=\n\t" + "BNE.N L_sp_3072_sub_in_pkace_96_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -13038,61 +13053,80 @@ static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x300\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_3072_mul_96_outer_%=:\n\t" + "L_sp_3072_mul_96_outer:\n\t" "SUBS r3, r5, #0x17c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_3072_mul_96_inner_%=:\n\t" + "L_sp_3072_mul_96_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x180\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_mul_96_inner_done_%=\n\t" + "BGT L_sp_3072_mul_96_inner_done\n\t" #else - "BEQ.N L_sp_3072_mul_96_inner_done_%=\n\t" + "BGT.N L_sp_3072_mul_96_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_mul_96_inner_%=\n\t" + "BLT L_sp_3072_mul_96_inner\n\t" #else - "BLE.N L_sp_3072_mul_96_inner_%=\n\t" + "BLT.N L_sp_3072_mul_96_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_3072_mul_96_inner_done_%=:\n\t" + "L_sp_3072_mul_96_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x2f8\n\t" + "CMP r5, #0x2f4\n\t" #if defined(__GNUC__) || 
defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_mul_96_outer_%=\n\t" + "BLE L_sp_3072_mul_96_outer\n\t" #else - "BLE.N L_sp_3072_mul_96_outer_%=\n\t" + "BLE.N L_sp_3072_mul_96_outer\n\t" #endif + "LDR lr, [%[a], #380]\n\t" + "LDR r11, [%[b], #380]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_3072_mul_96_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_3072_mul_96_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_mul_96_store_%=\n\t" + "BGT L_sp_3072_mul_96_store\n\t" #else - "BGT.N L_sp_3072_mul_96_store_%=\n\t" + "BGT.N L_sp_3072_mul_96_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -13118,24 +13152,20 @@ static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x300\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_3072_sqr_96_outer_%=:\n\t" + "L_sp_3072_sqr_96_outer:\n\t" "SUBS r3, r5, #0x17c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_3072_sqr_96_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_96_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_3072_sqr_96_op_sqr_%=\n\t" -#endif + "L_sp_3072_sqr_96_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -13145,59 +13175,51 @@ static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_3072_sqr_96_op_done_%=\n\t" - "\n" - "L_sp_3072_sqr_96_op_sqr_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_3072_sqr_96_inner_done\n\t" +#else + "BGT.N L_sp_3072_sqr_96_inner_done\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_3072_sqr_96_inner\n\t" +#else + "BLT.N L_sp_3072_sqr_96_inner\n\t" +#endif "LDR lr, [%[a], r3]\n\t" "UMULL r9, r10, lr, lr\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_3072_sqr_96_op_done_%=:\n\t" - "ADD r3, r3, #0x4\n\t" - "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x180\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_96_inner_done_%=\n\t" -#else - "BEQ.N L_sp_3072_sqr_96_inner_done_%=\n\t" -#endif - "CMP r3, r4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_sqr_96_inner_done_%=\n\t" -#else - "BGT.N L_sp_3072_sqr_96_inner_done_%=\n\t" -#endif - "CMP r3, r5\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_sqr_96_inner_%=\n\t" -#else - "BLE.N L_sp_3072_sqr_96_inner_%=\n\t" -#endif - "\n" - "L_sp_3072_sqr_96_inner_done_%=:\n\t" + "L_sp_3072_sqr_96_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x2f8\n\t" + "CMP r5, #0x2f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE 
L_sp_3072_sqr_96_outer_%=\n\t" + "BLE L_sp_3072_sqr_96_outer\n\t" #else - "BLE.N L_sp_3072_sqr_96_outer_%=\n\t" + "BLE.N L_sp_3072_sqr_96_outer\n\t" #endif + "LDR lr, [%[a], #380]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_3072_sqr_96_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_3072_sqr_96_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_sqr_96_store_%=\n\t" + "BGT L_sp_3072_sqr_96_store\n\t" #else - "BGT.N L_sp_3072_sqr_96_store_%=\n\t" + "BGT.N L_sp_3072_sqr_96_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -13247,7 +13269,7 @@ static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r3, #0x0\n\t" "ADD r12, %[a], #0xc0\n\t" "\n" - "L_sp_3072_add_48_word_%=:\n\t" + "L_sp_3072_add_48_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -13260,9 +13282,9 @@ static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_3072_add_48_word_%=\n\t" + "BNE L_sp_3072_add_48_word\n\t" #else - "BNE.N L_sp_3072_add_48_word_%=\n\t" + "BNE.N L_sp_3072_add_48_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -13294,7 +13316,7 @@ static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0xc0\n\t" "\n" - "L_sp_3072_sub_in_pkace_48_word_%=:\n\t" + "L_sp_3072_sub_in_pkace_48_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -13306,9 +13328,9 @@ static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_3072_sub_in_pkace_48_word_%=\n\t" + "BNE L_sp_3072_sub_in_pkace_48_word\n\t" #else - "BNE.N L_sp_3072_sub_in_pkace_48_word_%=\n\t" + "BNE.N L_sp_3072_sub_in_pkace_48_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -13340,61 +13362,80 @@ static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x180\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_3072_mul_48_outer_%=:\n\t" + "L_sp_3072_mul_48_outer:\n\t" "SUBS r3, r5, #0xbc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_3072_mul_48_inner_%=:\n\t" + "L_sp_3072_mul_48_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0xc0\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_mul_48_inner_done_%=\n\t" + "BGT 
L_sp_3072_mul_48_inner_done\n\t" #else - "BEQ.N L_sp_3072_mul_48_inner_done_%=\n\t" + "BGT.N L_sp_3072_mul_48_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_mul_48_inner_%=\n\t" + "BLT L_sp_3072_mul_48_inner\n\t" #else - "BLE.N L_sp_3072_mul_48_inner_%=\n\t" + "BLT.N L_sp_3072_mul_48_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_3072_mul_48_inner_done_%=:\n\t" + "L_sp_3072_mul_48_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x178\n\t" + "CMP r5, #0x174\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_mul_48_outer_%=\n\t" + "BLE L_sp_3072_mul_48_outer\n\t" #else - "BLE.N L_sp_3072_mul_48_outer_%=\n\t" + "BLE.N L_sp_3072_mul_48_outer\n\t" #endif + "LDR lr, [%[a], #188]\n\t" + "LDR r11, [%[b], #188]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_3072_mul_48_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_3072_mul_48_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_mul_48_store_%=\n\t" + "BGT L_sp_3072_mul_48_store\n\t" #else - "BGT.N L_sp_3072_mul_48_store_%=\n\t" + "BGT.N L_sp_3072_mul_48_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -13420,24 +13461,20 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x180\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_3072_sqr_48_outer_%=:\n\t" + "L_sp_3072_sqr_48_outer:\n\t" "SUBS r3, r5, #0xbc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_3072_sqr_48_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_3072_sqr_48_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_3072_sqr_48_op_sqr_%=\n\t" -#endif + "L_sp_3072_sqr_48_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -13447,59 +13484,51 @@ static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_3072_sqr_48_op_done_%=\n\t" - "\n" - "L_sp_3072_sqr_48_op_sqr_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_3072_sqr_48_inner_done\n\t" +#else + "BGT.N L_sp_3072_sqr_48_inner_done\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_3072_sqr_48_inner\n\t" +#else + "BLT.N L_sp_3072_sqr_48_inner\n\t" +#endif "LDR lr, [%[a], r3]\n\t" "UMULL r9, r10, lr, lr\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_3072_sqr_48_op_done_%=:\n\t" - "ADD r3, r3, #0x4\n\t" - "SUB r4, r4, #0x4\n\t" - "CMP r3, #0xc0\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - 
"BEQ L_sp_3072_sqr_48_inner_done_%=\n\t" -#else - "BEQ.N L_sp_3072_sqr_48_inner_done_%=\n\t" -#endif - "CMP r3, r4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_sqr_48_inner_done_%=\n\t" -#else - "BGT.N L_sp_3072_sqr_48_inner_done_%=\n\t" -#endif - "CMP r3, r5\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_sqr_48_inner_%=\n\t" -#else - "BLE.N L_sp_3072_sqr_48_inner_%=\n\t" -#endif - "\n" - "L_sp_3072_sqr_48_inner_done_%=:\n\t" + "L_sp_3072_sqr_48_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x178\n\t" + "CMP r5, #0x174\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_3072_sqr_48_outer_%=\n\t" + "BLE L_sp_3072_sqr_48_outer\n\t" #else - "BLE.N L_sp_3072_sqr_48_outer_%=\n\t" + "BLE.N L_sp_3072_sqr_48_outer\n\t" #endif + "LDR lr, [%[a], #188]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_3072_sqr_48_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_3072_sqr_48_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_3072_sqr_48_store_%=\n\t" + "BGT L_sp_3072_sqr_48_store\n\t" #else - "BGT.N L_sp_3072_sqr_48_store_%=\n\t" + "BGT.N L_sp_3072_sqr_48_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -13558,7 +13587,7 @@ static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_3072_mul_d_96_word_%=:\n\t" + "L_sp_3072_mul_d_96_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -13572,9 +13601,9 @@ static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mul_d_96_word_%=\n\t" + "BLT L_sp_3072_mul_d_96_word\n\t" #else - "BLT.N L_sp_3072_mul_d_96_word_%=\n\t" + "BLT.N L_sp_3072_mul_d_96_word\n\t" #endif "STR r3, [%[r], #384]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -14132,7 +14161,7 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_dig "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_3072_cond_sub_48_words_%=:\n\t" + "L_sp_3072_cond_sub_48_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -14143,9 +14172,9 @@ static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_dig "ADD r5, r5, #0x4\n\t" "CMP r5, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_cond_sub_48_words_%=\n\t" + "BLT L_sp_3072_cond_sub_48_words\n\t" #else - "BLT.N L_sp_3072_cond_sub_48_words_%=\n\t" + "BLT.N L_sp_3072_cond_sub_48_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -14384,7 +14413,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_3072_mont_reduce_48_word_%=:\n\t" + "L_sp_3072_mont_reduce_48_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -14775,9 +14804,9 @@ static void sp_3072_mont_reduce_48(sp_digit* 
a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0xc0\n\t" #ifdef __GNUC__ - "BLT L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_word\n\t" #else - "BLT.W L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT.W L_sp_3072_mont_reduce_48_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -14816,7 +14845,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_48_word_%=:\n\t" + "L_sp_3072_mont_reduce_48_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -14824,7 +14853,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_48_mul_%=:\n\t" + "L_sp_3072_mont_reduce_48_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -14867,9 +14896,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_48_mul_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_mul\n\t" #else - "BLT.N L_sp_3072_mont_reduce_48_mul_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_48_mul\n\t" #endif "LDR r10, [%[a], #192]\n\t" "ADDS r4, r4, r3\n\t" @@ -14883,9 +14912,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_word\n\t" #else - "BLT.N L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_48_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -14927,7 +14956,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_3072_mont_reduce_48_word_%=:\n\t" + "L_sp_3072_mont_reduce_48_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -15175,9 +15204,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0xc0\n\t" #ifdef __GNUC__ - "BLT L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_word\n\t" #else - "BLT.W L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT.W L_sp_3072_mont_reduce_48_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -15219,7 +15248,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_48_word_%=:\n\t" + "L_sp_3072_mont_reduce_48_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -15227,7 +15256,7 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_48_mul_%=:\n\t" + "L_sp_3072_mont_reduce_48_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -15258,9 +15287,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_48_mul_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_mul\n\t" #else - "BLT.N L_sp_3072_mont_reduce_48_mul_%=\n\t" + "BLT.N 
L_sp_3072_mont_reduce_48_mul\n\t" #endif "LDR r10, [%[a], #192]\n\t" "ADDS r4, r4, r3\n\t" @@ -15274,9 +15303,9 @@ static void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_48_word\n\t" #else - "BLT.N L_sp_3072_mont_reduce_48_word_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_48_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -15347,7 +15376,7 @@ static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_3072_mul_d_48_word_%=:\n\t" + "L_sp_3072_mul_d_48_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -15361,9 +15390,9 @@ static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mul_d_48_word_%=\n\t" + "BLT L_sp_3072_mul_d_48_word\n\t" #else - "BLT.N L_sp_3072_mul_d_48_word_%=\n\t" + "BLT.N L_sp_3072_mul_d_48_word\n\t" #endif "STR r3, [%[r], #192]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -15650,9 +15679,9 @@ static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -15715,9 +15744,9 @@ static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -15741,7 +15770,7 @@ static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_3072_word_48_bit_%=:\n\t" + "L_div_3072_word_48_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -15751,7 +15780,7 @@ static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_3072_word_48_bit_%=\n\t" + "bpl L_div_3072_word_48_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -15803,7 +15832,7 @@ static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0xbc\n\t" "\n" - "L_sp_3072_cmp_48_words_%=:\n\t" + "L_sp_3072_cmp_48_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -15816,7 +15845,7 @@ static sp_int32 sp_3072_cmp_48(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_3072_cmp_48_words_%=\n\t" + "bcs L_sp_3072_cmp_48_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #188]\n\t" @@ -16780,7 +16809,7 @@ static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a, const sp_dig "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_3072_cond_sub_96_words_%=:\n\t" + "L_sp_3072_cond_sub_96_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -16791,9 +16820,9 @@ static sp_digit sp_3072_cond_sub_96(sp_digit* r, const sp_digit* a, const sp_dig "ADD r5, r5, #0x4\n\t" "CMP r5, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_cond_sub_96_words_%=\n\t" + "BLT L_sp_3072_cond_sub_96_words\n\t" #else - "BLT.N L_sp_3072_cond_sub_96_words_%=\n\t" + "BLT.N L_sp_3072_cond_sub_96_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -17200,7 +17229,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_3072_mont_reduce_96_word_%=:\n\t" + "L_sp_3072_mont_reduce_96_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -17975,9 +18004,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x180\n\t" #ifdef __GNUC__ - "BLT L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_word\n\t" #else - "BLT.W L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT.W L_sp_3072_mont_reduce_96_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -18016,7 +18045,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_96_word_%=:\n\t" + "L_sp_3072_mont_reduce_96_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -18024,7 +18053,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - 
"L_sp_3072_mont_reduce_96_mul_%=:\n\t" + "L_sp_3072_mont_reduce_96_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -18067,9 +18096,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_96_mul_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_mul\n\t" #else - "BLT.N L_sp_3072_mont_reduce_96_mul_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_96_mul\n\t" #endif "LDR r10, [%[a], #384]\n\t" "ADDS r4, r4, r3\n\t" @@ -18083,9 +18112,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_word\n\t" #else - "BLT.N L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_96_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -18127,7 +18156,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_3072_mont_reduce_96_word_%=:\n\t" + "L_sp_3072_mont_reduce_96_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -18615,9 +18644,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x180\n\t" #ifdef __GNUC__ - "BLT L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_word\n\t" #else - "BLT.W L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT.W L_sp_3072_mont_reduce_96_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -18659,7 +18688,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_96_word_%=:\n\t" + "L_sp_3072_mont_reduce_96_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -18667,7 +18696,7 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_3072_mont_reduce_96_mul_%=:\n\t" + "L_sp_3072_mont_reduce_96_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -18698,9 +18727,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_96_mul_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_mul\n\t" #else - "BLT.N L_sp_3072_mont_reduce_96_mul_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_96_mul\n\t" #endif "LDR r10, [%[a], #384]\n\t" "ADDS r4, r4, r3\n\t" @@ -18714,9 +18743,9 @@ static void sp_3072_mont_reduce_96(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x180\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT L_sp_3072_mont_reduce_96_word\n\t" #else - "BLT.N L_sp_3072_mont_reduce_96_word_%=\n\t" + "BLT.N L_sp_3072_mont_reduce_96_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -18782,7 +18811,7 @@ static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x180\n\t" "\n" - "L_sp_3072_sub_96_word_%=:\n\t" + "L_sp_3072_sub_96_word:\n\t" "RSBS r11, r11, 
#0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -18794,9 +18823,9 @@ static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, const sp_digit* b "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_3072_sub_96_word_%=\n\t" + "BNE L_sp_3072_sub_96_word\n\t" #else - "BNE.N L_sp_3072_sub_96_word_%=\n\t" + "BNE.N L_sp_3072_sub_96_word\n\t" #endif "MOV %[r], r11\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -19014,9 +19043,9 @@ static sp_digit sp_3072_sub_96(sp_digit* r, const sp_digit* a, const sp_digit* b * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -19079,9 +19108,9 @@ static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -19105,7 +19134,7 @@ static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_3072_word_96_bit_%=:\n\t" + "L_div_3072_word_96_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -19115,7 +19144,7 @@ static sp_digit div_3072_word_96(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_3072_word_96_bit_%=\n\t" + "bpl L_div_3072_word_96_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -19270,7 +19299,7 @@ static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x17c\n\t" "\n" - "L_sp_3072_cmp_96_words_%=:\n\t" + "L_sp_3072_cmp_96_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -19283,7 +19312,7 @@ static sp_int32 sp_3072_cmp_96(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_3072_cmp_96_words_%=\n\t" + "bcs L_sp_3072_cmp_96_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #380]\n\t" @@ -20898,7 +20927,7 @@ static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a, const sp_dig "MOV r8, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_3072_cond_add_48_words_%=:\n\t" + "L_sp_3072_cond_add_48_words:\n\t" "ADDS r5, r5, #0xffffffff\n\t" "LDR r6, [%[a], r4]\n\t" "LDR r7, [%[b], r4]\n\t" @@ -20909,9 +20938,9 @@ static sp_digit sp_3072_cond_add_48(sp_digit* r, const sp_digit* a, const sp_dig "ADD r4, r4, #0x4\n\t" "CMP r4, #0xc0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_3072_cond_add_48_words_%=\n\t" + "BLT L_sp_3072_cond_add_48_words\n\t" #else - "BLT.N 
L_sp_3072_cond_add_48_words_%=\n\t" + "BLT.N L_sp_3072_cond_add_48_words\n\t" #endif "MOV %[r], r5\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -23042,7 +23071,7 @@ static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a, const sp_digit* "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x200\n\t" "\n" - "L_sp_4096_add_128_word_%=:\n\t" + "L_sp_4096_add_128_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -23055,9 +23084,9 @@ static sp_digit sp_4096_add_128(sp_digit* r, const sp_digit* a, const sp_digit* "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_4096_add_128_word_%=\n\t" + "BNE L_sp_4096_add_128_word\n\t" #else - "BNE.N L_sp_4096_add_128_word_%=\n\t" + "BNE.N L_sp_4096_add_128_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -23089,7 +23118,7 @@ static sp_digit sp_4096_sub_in_place_128(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x200\n\t" "\n" - "L_sp_4096_sub_in_pkace_128_word_%=:\n\t" + "L_sp_4096_sub_in_pkace_128_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -23101,9 +23130,9 @@ static sp_digit sp_4096_sub_in_place_128(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_4096_sub_in_pkace_128_word_%=\n\t" + "BNE L_sp_4096_sub_in_pkace_128_word\n\t" #else - "BNE.N L_sp_4096_sub_in_pkace_128_word_%=\n\t" + "BNE.N L_sp_4096_sub_in_pkace_128_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -23135,61 +23164,80 @@ static void sp_4096_mul_128(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x400\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_4096_mul_128_outer_%=:\n\t" + "L_sp_4096_mul_128_outer:\n\t" "SUBS r3, r5, #0x1fc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_4096_mul_128_inner_%=:\n\t" + "L_sp_4096_mul_128_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x200\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_4096_mul_128_inner_done_%=\n\t" + "BGT L_sp_4096_mul_128_inner_done\n\t" #else - "BEQ.N L_sp_4096_mul_128_inner_done_%=\n\t" + "BGT.N L_sp_4096_mul_128_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_4096_mul_128_inner_%=\n\t" + "BLT L_sp_4096_mul_128_inner\n\t" #else - "BLE.N L_sp_4096_mul_128_inner_%=\n\t" + "BLT.N L_sp_4096_mul_128_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_4096_mul_128_inner_done_%=:\n\t" + "L_sp_4096_mul_128_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" 
"MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x3f8\n\t" + "CMP r5, #0x3f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_4096_mul_128_outer_%=\n\t" + "BLE L_sp_4096_mul_128_outer\n\t" #else - "BLE.N L_sp_4096_mul_128_outer_%=\n\t" + "BLE.N L_sp_4096_mul_128_outer\n\t" #endif + "LDR lr, [%[a], #508]\n\t" + "LDR r11, [%[b], #508]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_4096_mul_128_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_4096_mul_128_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_4096_mul_128_store_%=\n\t" + "BGT L_sp_4096_mul_128_store\n\t" #else - "BGT.N L_sp_4096_mul_128_store_%=\n\t" + "BGT.N L_sp_4096_mul_128_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -23215,24 +23263,20 @@ static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x400\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_4096_sqr_128_outer_%=:\n\t" + "L_sp_4096_sqr_128_outer:\n\t" "SUBS r3, r5, #0x1fc\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_4096_sqr_128_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_4096_sqr_128_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_4096_sqr_128_op_sqr_%=\n\t" -#endif + "L_sp_4096_sqr_128_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -23242,59 +23286,51 @@ static void sp_4096_sqr_128(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_4096_sqr_128_op_done_%=\n\t" - "\n" - "L_sp_4096_sqr_128_op_sqr_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_4096_sqr_128_inner_done\n\t" +#else + "BGT.N L_sp_4096_sqr_128_inner_done\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_4096_sqr_128_inner\n\t" +#else + "BLT.N L_sp_4096_sqr_128_inner\n\t" +#endif "LDR lr, [%[a], r3]\n\t" "UMULL r9, r10, lr, lr\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_4096_sqr_128_op_done_%=:\n\t" - "ADD r3, r3, #0x4\n\t" - "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x200\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_4096_sqr_128_inner_done_%=\n\t" -#else - "BEQ.N L_sp_4096_sqr_128_inner_done_%=\n\t" -#endif - "CMP r3, r4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_4096_sqr_128_inner_done_%=\n\t" -#else - "BGT.N L_sp_4096_sqr_128_inner_done_%=\n\t" -#endif - "CMP r3, r5\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_4096_sqr_128_inner_%=\n\t" -#else - "BLE.N L_sp_4096_sqr_128_inner_%=\n\t" -#endif - "\n" - "L_sp_4096_sqr_128_inner_done_%=:\n\t" + "L_sp_4096_sqr_128_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD 
r5, r5, #0x4\n\t" - "CMP r5, #0x3f8\n\t" + "CMP r5, #0x3f4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_4096_sqr_128_outer_%=\n\t" + "BLE L_sp_4096_sqr_128_outer\n\t" #else - "BLE.N L_sp_4096_sqr_128_outer_%=\n\t" + "BLE.N L_sp_4096_sqr_128_outer\n\t" #endif + "LDR lr, [%[a], #508]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_4096_sqr_128_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_4096_sqr_128_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_4096_sqr_128_store_%=\n\t" + "BGT L_sp_4096_sqr_128_store\n\t" #else - "BGT.N L_sp_4096_sqr_128_store_%=\n\t" + "BGT.N L_sp_4096_sqr_128_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -23351,7 +23387,7 @@ static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_4096_mul_d_128_word_%=:\n\t" + "L_sp_4096_mul_d_128_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -23365,9 +23401,9 @@ static void sp_4096_mul_d_128(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_mul_d_128_word_%=\n\t" + "BLT L_sp_4096_mul_d_128_word\n\t" #else - "BLT.N L_sp_4096_mul_d_128_word_%=\n\t" + "BLT.N L_sp_4096_mul_d_128_word\n\t" #endif "STR r3, [%[r], #512]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -24086,7 +24122,7 @@ static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a, const sp_di "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_4096_cond_sub_128_words_%=:\n\t" + "L_sp_4096_cond_sub_128_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -24097,9 +24133,9 @@ static sp_digit sp_4096_cond_sub_128(sp_digit* r, const sp_digit* a, const sp_di "ADD r5, r5, #0x4\n\t" "CMP r5, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_cond_sub_128_words_%=\n\t" + "BLT L_sp_4096_cond_sub_128_words\n\t" #else - "BLT.N L_sp_4096_cond_sub_128_words_%=\n\t" + "BLT.N L_sp_4096_cond_sub_128_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -24618,7 +24654,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_4096_mont_reduce_128_word_%=:\n\t" + "L_sp_4096_mont_reduce_128_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -25649,9 +25685,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x200\n\t" #ifdef __GNUC__ - "BLT L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_word\n\t" #else - "BLT.W L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT.W L_sp_4096_mont_reduce_128_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -25690,7 +25726,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_4096_mont_reduce_128_word_%=:\n\t" + "L_sp_4096_mont_reduce_128_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" 
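/*
 * For reference: the L_sp_*_mont_reduce_*_word loops in the hunks above and
 * below implement word-wise Montgomery reduction (mu = a[i] * mp, then
 * a += m * mu shifted up by i words, with UMULL/UMLAL carry propagation).
 * A minimal portable-C sketch of the same idea, assuming 32-bit words; the
 * function name and signature are illustrative only, not part of this patch
 * or of the wolfSSL API.
 */
#include <stdint.h>

static void mont_reduce_sketch(uint32_t* a, const uint32_t* m, uint32_t mp,
                               int n)
{
    uint32_t ca = 0;            /* carry running into a[i + n] */
    int i, j;

    for (i = 0; i < n; i++) {
        /* mp == -m[0]^-1 mod 2^32, so a[i] + m[0] * mu == 0 (mod 2^32) */
        uint32_t mu = a[i] * mp;
        uint32_t c = 0;
        uint64_t t;

        /* a[i..i+n-1] += m * mu, one multiply-accumulate per word */
        for (j = 0; j < n; j++) {
            t = (uint64_t)mu * m[j] + a[i + j] + c;
            a[i + j] = (uint32_t)t;
            c = (uint32_t)(t >> 32);
        }
        /* fold the inner carry and the running carry into a[i + n] */
        t = (uint64_t)a[i + n] + c + ca;
        a[i + n] = (uint32_t)t;
        ca = (uint32_t)(t >> 32);
    }
    /* as in the assembly, a final conditional subtract of m (keyed off the
     * last carry) brings the result under the modulus */
}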
@@ -25698,7 +25734,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_4096_mont_reduce_128_mul_%=:\n\t" + "L_sp_4096_mont_reduce_128_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -25741,9 +25777,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_mont_reduce_128_mul_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_mul\n\t" #else - "BLT.N L_sp_4096_mont_reduce_128_mul_%=\n\t" + "BLT.N L_sp_4096_mont_reduce_128_mul\n\t" #endif "LDR r10, [%[a], #512]\n\t" "ADDS r4, r4, r3\n\t" @@ -25757,9 +25793,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_word\n\t" #else - "BLT.N L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT.N L_sp_4096_mont_reduce_128_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -25801,7 +25837,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_4096_mont_reduce_128_word_%=:\n\t" + "L_sp_4096_mont_reduce_128_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -26449,9 +26485,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x200\n\t" #ifdef __GNUC__ - "BLT L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_word\n\t" #else - "BLT.W L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT.W L_sp_4096_mont_reduce_128_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -26493,7 +26529,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) /* ca = 0 */ "MOV r3, #0x0\n\t" "\n" - "L_sp_4096_mont_reduce_128_word_%=:\n\t" + "L_sp_4096_mont_reduce_128_word:\n\t" /* mu = a[i] * mp */ "LDR r10, [%[a]]\n\t" "MUL r8, %[mp], r10\n\t" @@ -26501,7 +26537,7 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "MOV r12, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_4096_mont_reduce_128_mul_%=:\n\t" + "L_sp_4096_mont_reduce_128_mul:\n\t" /* a[i+j+0] += m[j+0] * mu */ "LDR r7, [%[m], r12]\n\t" "LDR r10, [%[a], r12]\n\t" @@ -26532,9 +26568,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD r12, r12, #0x4\n\t" "CMP r12, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_mont_reduce_128_mul_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_mul\n\t" #else - "BLT.N L_sp_4096_mont_reduce_128_mul_%=\n\t" + "BLT.N L_sp_4096_mont_reduce_128_mul\n\t" #endif "LDR r10, [%[a], #512]\n\t" "ADDS r4, r4, r3\n\t" @@ -26548,9 +26584,9 @@ static void sp_4096_mont_reduce_128(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r9, #0x200\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT L_sp_4096_mont_reduce_128_word\n\t" #else - "BLT.N L_sp_4096_mont_reduce_128_word_%=\n\t" + "BLT.N L_sp_4096_mont_reduce_128_word\n\t" #endif /* Loop Done */ "MOV %[mp], r3\n\t" @@ -26616,7 +26652,7 @@ static sp_digit 
sp_4096_sub_128(sp_digit* r, const sp_digit* a, const sp_digit* "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x200\n\t" "\n" - "L_sp_4096_sub_128_word_%=:\n\t" + "L_sp_4096_sub_128_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -26628,9 +26664,9 @@ static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, const sp_digit* "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_4096_sub_128_word_%=\n\t" + "BNE L_sp_4096_sub_128_word\n\t" #else - "BNE.N L_sp_4096_sub_128_word_%=\n\t" + "BNE.N L_sp_4096_sub_128_word\n\t" #endif "MOV %[r], r11\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -26904,9 +26940,9 @@ static sp_digit sp_4096_sub_128(sp_digit* r, const sp_digit* a, const sp_digit* * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -26969,9 +27005,9 @@ static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -26995,7 +27031,7 @@ static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_4096_word_128_bit_%=:\n\t" + "L_div_4096_word_128_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -27005,7 +27041,7 @@ static sp_digit div_4096_word_128(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_4096_word_128_bit_%=\n\t" + "bpl L_div_4096_word_128_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -27160,7 +27196,7 @@ static sp_int32 sp_4096_cmp_128(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x1fc\n\t" "\n" - "L_sp_4096_cmp_128_words_%=:\n\t" + "L_sp_4096_cmp_128_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -27173,7 +27209,7 @@ static sp_int32 sp_4096_cmp_128(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_4096_cmp_128_words_%=\n\t" + "bcs L_sp_4096_cmp_128_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #508]\n\t" @@ -29140,7 +29176,7 @@ static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a, const sp_dig "MOV r8, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_4096_cond_add_64_words_%=:\n\t" + "L_sp_4096_cond_add_64_words:\n\t" "ADDS r5, r5, #0xffffffff\n\t" "LDR r6, [%[a], r4]\n\t" "LDR r7, [%[b], r4]\n\t" @@ -29151,9 +29187,9 @@ static sp_digit sp_4096_cond_add_64(sp_digit* r, const sp_digit* a, const sp_dig "ADD 
r4, r4, #0x4\n\t" "CMP r4, #0x100\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_4096_cond_add_64_words_%=\n\t" + "BLT L_sp_4096_cond_add_64_words\n\t" #else - "BLT.N L_sp_4096_cond_add_64_words_%=\n\t" + "BLT.N L_sp_4096_cond_add_64_words\n\t" #endif "MOV %[r], r5\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -30825,61 +30861,80 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x40\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_256_mul_8_outer_%=:\n\t" + "L_sp_256_mul_8_outer:\n\t" "SUBS r3, r5, #0x1c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_256_mul_8_inner_%=:\n\t" + "L_sp_256_mul_8_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x20\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_mul_8_inner_done_%=\n\t" + "BGT L_sp_256_mul_8_inner_done\n\t" #else - "BEQ.N L_sp_256_mul_8_inner_done_%=\n\t" + "BGT.N L_sp_256_mul_8_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_256_mul_8_inner_%=\n\t" + "BLT L_sp_256_mul_8_inner\n\t" #else - "BLE.N L_sp_256_mul_8_inner_%=\n\t" + "BLT.N L_sp_256_mul_8_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_256_mul_8_inner_done_%=:\n\t" + "L_sp_256_mul_8_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x38\n\t" + "CMP r5, #0x34\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_256_mul_8_outer_%=\n\t" + "BLE L_sp_256_mul_8_outer\n\t" #else - "BLE.N L_sp_256_mul_8_outer_%=\n\t" + "BLE.N L_sp_256_mul_8_outer\n\t" #endif + "LDR lr, [%[a], #28]\n\t" + "LDR r11, [%[b], #28]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_256_mul_8_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_256_mul_8_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_256_mul_8_store_%=\n\t" + "BGT L_sp_256_mul_8_store\n\t" #else - "BGT.N L_sp_256_mul_8_store_%=\n\t" + "BGT.N L_sp_256_mul_8_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -31411,24 +31466,20 @@ static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x40\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_256_sqr_8_outer_%=:\n\t" + 
"L_sp_256_sqr_8_outer:\n\t" "SUBS r3, r5, #0x1c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_256_sqr_8_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_sqr_8_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_256_sqr_8_op_sqr_%=\n\t" -#endif + "L_sp_256_sqr_8_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -31438,59 +31489,51 @@ static void sp_256_sqr_8(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_256_sqr_8_op_done_%=\n\t" - "\n" - "L_sp_256_sqr_8_op_sqr_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_256_sqr_8_inner_done\n\t" +#else + "BGT.N L_sp_256_sqr_8_inner_done\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_256_sqr_8_inner\n\t" +#else + "BLT.N L_sp_256_sqr_8_inner\n\t" +#endif "LDR lr, [%[a], r3]\n\t" "UMULL r9, r10, lr, lr\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_256_sqr_8_op_done_%=:\n\t" - "ADD r3, r3, #0x4\n\t" - "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x20\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_sqr_8_inner_done_%=\n\t" -#else - "BEQ.N L_sp_256_sqr_8_inner_done_%=\n\t" -#endif - "CMP r3, r4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_256_sqr_8_inner_done_%=\n\t" -#else - "BGT.N L_sp_256_sqr_8_inner_done_%=\n\t" -#endif - "CMP r3, r5\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_256_sqr_8_inner_%=\n\t" -#else - "BLE.N L_sp_256_sqr_8_inner_%=\n\t" -#endif - "\n" - "L_sp_256_sqr_8_inner_done_%=:\n\t" + "L_sp_256_sqr_8_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x38\n\t" + "CMP r5, #0x34\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_256_sqr_8_outer_%=\n\t" + "BLE L_sp_256_sqr_8_outer\n\t" #else - "BLE.N L_sp_256_sqr_8_outer_%=\n\t" + "BLE.N L_sp_256_sqr_8_outer\n\t" #endif + "LDR lr, [%[a], #28]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_256_sqr_8_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_256_sqr_8_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_256_sqr_8_store_%=\n\t" + "BGT L_sp_256_sqr_8_store\n\t" #else - "BGT.N L_sp_256_sqr_8_store_%=\n\t" + "BGT.N L_sp_256_sqr_8_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -31896,7 +31939,7 @@ static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x20\n\t" "\n" - "L_sp_256_add_8_word_%=:\n\t" + "L_sp_256_add_8_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -31909,9 +31952,9 @@ static sp_digit sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_256_add_8_word_%=\n\t" + "BNE L_sp_256_add_8_word\n\t" #else - "BNE.N L_sp_256_add_8_word_%=\n\t" + "BNE.N L_sp_256_add_8_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -33931,7 +33974,7 @@ static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x1c\n\t" "\n" - "L_sp_256_cmp_8_words_%=:\n\t" + "L_sp_256_cmp_8_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -33944,7 +33987,7 @@ static sp_int32 sp_256_cmp_8(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_256_cmp_8_words_%=\n\t" + "bcs L_sp_256_cmp_8_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #28]\n\t" @@ -34078,7 +34121,7 @@ static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a, const sp_digit "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_256_cond_sub_8_words_%=:\n\t" + "L_sp_256_cond_sub_8_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -34089,9 +34132,9 @@ static sp_digit sp_256_cond_sub_8(sp_digit* r, const sp_digit* a, const sp_digit "ADD r5, r5, #0x4\n\t" "CMP r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_256_cond_sub_8_words_%=\n\t" + "BLT L_sp_256_cond_sub_8_words\n\t" #else - "BLT.N L_sp_256_cond_sub_8_words_%=\n\t" + "BLT.N L_sp_256_cond_sub_8_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -34192,7 +34235,7 @@ static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_256_mont_reduce_8_word_%=:\n\t" + "L_sp_256_mont_reduce_8_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -34263,9 +34306,9 @@ static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x20\n\t" #ifdef __GNUC__ - "BLT L_sp_256_mont_reduce_8_word_%=\n\t" + "BLT L_sp_256_mont_reduce_8_word\n\t" #else - "BLT.W L_sp_256_mont_reduce_8_word_%=\n\t" + "BLT.W L_sp_256_mont_reduce_8_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -34307,7 +34350,7 @@ static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_256_mont_reduce_8_word_%=:\n\t" + "L_sp_256_mont_reduce_8_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -34355,9 +34398,9 @@ static void sp_256_mont_reduce_8(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x20\n\t" #ifdef __GNUC__ - "BLT L_sp_256_mont_reduce_8_word_%=\n\t" + "BLT L_sp_256_mont_reduce_8_word\n\t" #else - "BLT.W L_sp_256_mont_reduce_8_word_%=\n\t" + "BLT.W L_sp_256_mont_reduce_8_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -34566,7 +34609,7 @@ static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, sp_digit "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_256_mont_reduce_order_8_word_%=:\n\t" + "L_sp_256_mont_reduce_order_8_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -34637,9 +34680,9 @@ static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, sp_digit "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x20\n\t" #ifdef __GNUC__ - "BLT L_sp_256_mont_reduce_order_8_word_%=\n\t" + "BLT L_sp_256_mont_reduce_order_8_word\n\t" #else - "BLT.W 
L_sp_256_mont_reduce_order_8_word_%=\n\t" + "BLT.W L_sp_256_mont_reduce_order_8_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -34681,7 +34724,7 @@ static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, sp_digit "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_256_mont_reduce_order_8_word_%=:\n\t" + "L_sp_256_mont_reduce_order_8_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -34729,9 +34772,9 @@ static void sp_256_mont_reduce_order_8(sp_digit* a, const sp_digit* m, sp_digit "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x20\n\t" #ifdef __GNUC__ - "BLT L_sp_256_mont_reduce_order_8_word_%=\n\t" + "BLT L_sp_256_mont_reduce_order_8_word\n\t" #else - "BLT.W L_sp_256_mont_reduce_order_8_word_%=\n\t" + "BLT.W L_sp_256_mont_reduce_order_8_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -39060,7 +39103,7 @@ static sp_digit sp_256_sub_in_place_8(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x20\n\t" "\n" - "L_sp_256_sub_in_pkace_8_word_%=:\n\t" + "L_sp_256_sub_in_pkace_8_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -39072,9 +39115,9 @@ static sp_digit sp_256_sub_in_place_8(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_256_sub_in_pkace_8_word_%=\n\t" + "BNE L_sp_256_sub_in_pkace_8_word\n\t" #else - "BNE.N L_sp_256_sub_in_pkace_8_word_%=\n\t" + "BNE.N L_sp_256_sub_in_pkace_8_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -39153,7 +39196,7 @@ static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_256_mul_d_8_word_%=:\n\t" + "L_sp_256_mul_d_8_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -39167,9 +39210,9 @@ static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_256_mul_d_8_word_%=\n\t" + "BLT L_sp_256_mul_d_8_word\n\t" #else - "BLT.N L_sp_256_mul_d_8_word_%=\n\t" + "BLT.N L_sp_256_mul_d_8_word\n\t" #endif "STR r3, [%[r], #32]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -39256,9 +39299,9 @@ static void sp_256_mul_d_8(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -39321,9 +39364,9 @@ static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -39347,7 +39390,7 @@ static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_256_word_8_bit_%=:\n\t" + "L_div_256_word_8_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -39357,7 +39400,7 @@ static sp_digit div_256_word_8(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_256_word_8_bit_%=\n\t" + "bpl L_div_256_word_8_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -40051,7 +40094,7 @@ static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x20\n\t" "\n" - "L_sp_256_sub_8_word_%=:\n\t" + "L_sp_256_sub_8_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -40063,9 +40106,9 @@ static sp_digit sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b) "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_256_sub_8_word_%=\n\t" + "BNE L_sp_256_sub_8_word\n\t" #else - "BNE.N L_sp_256_sub_8_word_%=\n\t" + "BNE.N L_sp_256_sub_8_word\n\t" #endif "MOV %[r], r11\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -40185,9 +40228,9 @@ static void sp_256_div2_mod_8(sp_digit* r, const sp_digit* a, const sp_digit* m) "LDM %[a]!, {r4}\n\t" "ANDS r3, r4, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_div2_mod_8_even_%=\n\t" + "BEQ L_sp_256_div2_mod_8_even\n\t" #else - "BEQ.N L_sp_256_div2_mod_8_even_%=\n\t" + "BEQ.N L_sp_256_div2_mod_8_even\n\t" #endif "LDM %[a]!, {r5, r6, r7}\n\t" "LDM %[m]!, {r8, r9, r10, r11}\n\t" @@ -40203,13 +40246,17 @@ static void sp_256_div2_mod_8(sp_digit* r, const sp_digit* a, const sp_digit* m) "ADCS r6, r6, r10\n\t" "ADCS r7, r7, r11\n\t" "ADC r3, r12, r12\n\t" - "B L_sp_256_div2_mod_8_div2_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_div2_mod_8_div2\n\t" +#else + "B.N L_sp_256_div2_mod_8_div2\n\t" +#endif "\n" - "L_sp_256_div2_mod_8_even_%=:\n\t" + "L_sp_256_div2_mod_8_even:\n\t" "LDRD r4, r5, [%[a], #12]\n\t" "LDRD r6, r7, [%[a], #20]\n\t" "\n" - "L_sp_256_div2_mod_8_div2_%=:\n\t" + "L_sp_256_div2_mod_8_div2:\n\t" "LSR r8, r4, #1\n\t" "AND r4, r4, #0x1\n\t" "LSR r9, r5, #1\n\t" @@ -40252,100 +40299,128 @@ static int sp_256_num_bits_8(const sp_digit* a) "LDR r1, [%[a], #28]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_7_%=\n\t" + "BEQ L_sp_256_num_bits_8_7\n\t" #else - "BEQ.N L_sp_256_num_bits_8_7_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_7\n\t" #endif "MOV r2, #0x100\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_7_%=:\n\t" + 
"L_sp_256_num_bits_8_7:\n\t" "LDR r1, [%[a], #24]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_6_%=\n\t" + "BEQ L_sp_256_num_bits_8_6\n\t" #else - "BEQ.N L_sp_256_num_bits_8_6_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_6\n\t" #endif "MOV r2, #0xe0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_6_%=:\n\t" + "L_sp_256_num_bits_8_6:\n\t" "LDR r1, [%[a], #20]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_5_%=\n\t" + "BEQ L_sp_256_num_bits_8_5\n\t" #else - "BEQ.N L_sp_256_num_bits_8_5_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_5\n\t" #endif "MOV r2, #0xc0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_5_%=:\n\t" + "L_sp_256_num_bits_8_5:\n\t" "LDR r1, [%[a], #16]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_4_%=\n\t" + "BEQ L_sp_256_num_bits_8_4\n\t" #else - "BEQ.N L_sp_256_num_bits_8_4_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_4\n\t" #endif "MOV r2, #0xa0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_4_%=:\n\t" + "L_sp_256_num_bits_8_4:\n\t" "LDR r1, [%[a], #12]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_3_%=\n\t" + "BEQ L_sp_256_num_bits_8_3\n\t" #else - "BEQ.N L_sp_256_num_bits_8_3_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_3\n\t" #endif "MOV r2, #0x80\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_3_%=:\n\t" + "L_sp_256_num_bits_8_3:\n\t" "LDR r1, [%[a], #8]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_2_%=\n\t" + "BEQ L_sp_256_num_bits_8_2\n\t" #else - "BEQ.N L_sp_256_num_bits_8_2_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_2\n\t" #endif "MOV r2, #0x60\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_2_%=:\n\t" + "L_sp_256_num_bits_8_2:\n\t" "LDR r1, [%[a], #4]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_256_num_bits_8_1_%=\n\t" + "BEQ L_sp_256_num_bits_8_1\n\t" #else - "BEQ.N L_sp_256_num_bits_8_1_%=\n\t" + "BEQ.N L_sp_256_num_bits_8_1\n\t" #endif "MOV r2, #0x40\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_256_num_bits_8_9_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_256_num_bits_8_9\n\t" +#else + "B.N 
L_sp_256_num_bits_8_9\n\t" +#endif "\n" - "L_sp_256_num_bits_8_1_%=:\n\t" + "L_sp_256_num_bits_8_1:\n\t" "LDR r1, [%[a]]\n\t" "MOV r2, #0x20\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" "\n" - "L_sp_256_num_bits_8_9_%=:\n\t" + "L_sp_256_num_bits_8_9:\n\t" "MOV %[a], r4\n\t" : [a] "+r" (a) : @@ -41460,61 +41535,80 @@ static void sp_384_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x60\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_384_mul_12_outer_%=:\n\t" + "L_sp_384_mul_12_outer:\n\t" "SUBS r3, r5, #0x2c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_384_mul_12_inner_%=:\n\t" + "L_sp_384_mul_12_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x30\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_mul_12_inner_done_%=\n\t" + "BGT L_sp_384_mul_12_inner_done\n\t" #else - "BEQ.N L_sp_384_mul_12_inner_done_%=\n\t" + "BGT.N L_sp_384_mul_12_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_384_mul_12_inner_%=\n\t" + "BLT L_sp_384_mul_12_inner\n\t" #else - "BLE.N L_sp_384_mul_12_inner_%=\n\t" + "BLT.N L_sp_384_mul_12_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_384_mul_12_inner_done_%=:\n\t" + "L_sp_384_mul_12_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x58\n\t" + "CMP r5, #0x54\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_384_mul_12_outer_%=\n\t" + "BLE L_sp_384_mul_12_outer\n\t" #else - "BLE.N L_sp_384_mul_12_outer_%=\n\t" + "BLE.N L_sp_384_mul_12_outer\n\t" #endif + "LDR lr, [%[a], #44]\n\t" + "LDR r11, [%[b], #44]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_384_mul_12_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_384_mul_12_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_384_mul_12_store_%=\n\t" + "BGT L_sp_384_mul_12_store\n\t" #else - "BGT.N L_sp_384_mul_12_store_%=\n\t" + "BGT.N L_sp_384_mul_12_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -42570,24 +42664,20 @@ static void sp_384_sqr_12(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x60\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_384_sqr_12_outer_%=:\n\t" + "L_sp_384_sqr_12_outer:\n\t" "SUBS r3, r5, #0x2c\n\t" "IT 
cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_384_sqr_12_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_sqr_12_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_384_sqr_12_op_sqr_%=\n\t" -#endif + "L_sp_384_sqr_12_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -42597,59 +42687,51 @@ static void sp_384_sqr_12(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_384_sqr_12_op_done_%=\n\t" - "\n" - "L_sp_384_sqr_12_op_sqr_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_384_sqr_12_inner_done\n\t" +#else + "BGT.N L_sp_384_sqr_12_inner_done\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_384_sqr_12_inner\n\t" +#else + "BLT.N L_sp_384_sqr_12_inner\n\t" +#endif "LDR lr, [%[a], r3]\n\t" "UMULL r9, r10, lr, lr\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_384_sqr_12_op_done_%=:\n\t" - "ADD r3, r3, #0x4\n\t" - "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x30\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_sqr_12_inner_done_%=\n\t" -#else - "BEQ.N L_sp_384_sqr_12_inner_done_%=\n\t" -#endif - "CMP r3, r4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_384_sqr_12_inner_done_%=\n\t" -#else - "BGT.N L_sp_384_sqr_12_inner_done_%=\n\t" -#endif - "CMP r3, r5\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_384_sqr_12_inner_%=\n\t" -#else - "BLE.N L_sp_384_sqr_12_inner_%=\n\t" -#endif - "\n" - "L_sp_384_sqr_12_inner_done_%=:\n\t" + "L_sp_384_sqr_12_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x58\n\t" + "CMP r5, #0x54\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_384_sqr_12_outer_%=\n\t" + "BLE L_sp_384_sqr_12_outer\n\t" #else - "BLE.N L_sp_384_sqr_12_outer_%=\n\t" + "BLE.N L_sp_384_sqr_12_outer\n\t" #endif + "LDR lr, [%[a], #44]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_384_sqr_12_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_384_sqr_12_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_384_sqr_12_store_%=\n\t" + "BGT L_sp_384_sqr_12_store\n\t" #else - "BGT.N L_sp_384_sqr_12_store_%=\n\t" + "BGT.N L_sp_384_sqr_12_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -43382,7 +43464,7 @@ static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x30\n\t" "\n" - "L_sp_384_add_12_word_%=:\n\t" + "L_sp_384_add_12_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -43395,9 +43477,9 @@ static sp_digit sp_384_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - 
"BNE L_sp_384_add_12_word_%=\n\t" + "BNE L_sp_384_add_12_word\n\t" #else - "BNE.N L_sp_384_add_12_word_%=\n\t" + "BNE.N L_sp_384_add_12_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -43782,7 +43864,7 @@ static sp_digit sp_384_cond_sub_12(sp_digit* r, const sp_digit* a, const sp_digi "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_384_cond_sub_12_words_%=:\n\t" + "L_sp_384_cond_sub_12_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -43793,9 +43875,9 @@ static sp_digit sp_384_cond_sub_12(sp_digit* r, const sp_digit* a, const sp_digi "ADD r5, r5, #0x4\n\t" "CMP r5, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_384_cond_sub_12_words_%=\n\t" + "BLT L_sp_384_cond_sub_12_words\n\t" #else - "BLT.N L_sp_384_cond_sub_12_words_%=\n\t" + "BLT.N L_sp_384_cond_sub_12_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -43909,7 +43991,7 @@ static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_384_mont_reduce_12_word_%=:\n\t" + "L_sp_384_mont_reduce_12_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -44012,9 +44094,9 @@ static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x30\n\t" #ifdef __GNUC__ - "BLT L_sp_384_mont_reduce_12_word_%=\n\t" + "BLT L_sp_384_mont_reduce_12_word\n\t" #else - "BLT.W L_sp_384_mont_reduce_12_word_%=\n\t" + "BLT.W L_sp_384_mont_reduce_12_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -44056,7 +44138,7 @@ static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_384_mont_reduce_12_word_%=:\n\t" + "L_sp_384_mont_reduce_12_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -44124,9 +44206,9 @@ static void sp_384_mont_reduce_12(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x30\n\t" #ifdef __GNUC__ - "BLT L_sp_384_mont_reduce_12_word_%=\n\t" + "BLT L_sp_384_mont_reduce_12_word\n\t" #else - "BLT.W L_sp_384_mont_reduce_12_word_%=\n\t" + "BLT.W L_sp_384_mont_reduce_12_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -44311,7 +44393,7 @@ static sp_int32 sp_384_cmp_12(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x2c\n\t" "\n" - "L_sp_384_cmp_12_words_%=:\n\t" + "L_sp_384_cmp_12_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -44324,7 +44406,7 @@ static sp_int32 sp_384_cmp_12(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_384_cmp_12_words_%=\n\t" + "bcs L_sp_384_cmp_12_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #44]\n\t" @@ -44614,7 +44696,7 @@ static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x30\n\t" "\n" - "L_sp_384_sub_12_word_%=:\n\t" + "L_sp_384_sub_12_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -44626,9 +44708,9 @@ static sp_digit sp_384_sub_12(sp_digit* r, const sp_digit* a, const sp_digit* b) "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_384_sub_12_word_%=\n\t" + "BNE L_sp_384_sub_12_word\n\t" #else - 
"BNE.N L_sp_384_sub_12_word_%=\n\t" + "BNE.N L_sp_384_sub_12_word\n\t" #endif "MOV %[r], r11\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -44715,7 +44797,7 @@ static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a, const sp_digi "MOV r8, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_384_cond_add_12_words_%=:\n\t" + "L_sp_384_cond_add_12_words:\n\t" "ADDS r5, r5, #0xffffffff\n\t" "LDR r6, [%[a], r4]\n\t" "LDR r7, [%[b], r4]\n\t" @@ -44726,9 +44808,9 @@ static sp_digit sp_384_cond_add_12(sp_digit* r, const sp_digit* a, const sp_digi "ADD r4, r4, #0x4\n\t" "CMP r4, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_384_cond_add_12_words_%=\n\t" + "BLT L_sp_384_cond_add_12_words\n\t" #else - "BLT.N L_sp_384_cond_add_12_words_%=\n\t" + "BLT.N L_sp_384_cond_add_12_words\n\t" #endif "MOV %[r], r5\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -48912,7 +48994,7 @@ static sp_digit sp_384_sub_in_place_12(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x30\n\t" "\n" - "L_sp_384_sub_in_pkace_12_word_%=:\n\t" + "L_sp_384_sub_in_pkace_12_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -48924,9 +49006,9 @@ static sp_digit sp_384_sub_in_place_12(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_384_sub_in_pkace_12_word_%=\n\t" + "BNE L_sp_384_sub_in_pkace_12_word\n\t" #else - "BNE.N L_sp_384_sub_in_pkace_12_word_%=\n\t" + "BNE.N L_sp_384_sub_in_pkace_12_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -49012,7 +49094,7 @@ static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_384_mul_d_12_word_%=:\n\t" + "L_sp_384_mul_d_12_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -49026,9 +49108,9 @@ static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x30\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_384_mul_d_12_word_%=\n\t" + "BLT L_sp_384_mul_d_12_word\n\t" #else - "BLT.N L_sp_384_mul_d_12_word_%=\n\t" + "BLT.N L_sp_384_mul_d_12_word\n\t" #endif "STR r3, [%[r], #48]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -49135,9 +49217,9 @@ static void sp_384_mul_d_12(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -49200,9 +49282,9 @@ static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -49226,7 +49308,7 @@ static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_384_word_12_bit_%=:\n\t" + "L_div_384_word_12_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -49236,7 +49318,7 @@ static sp_digit div_384_word_12(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_384_word_12_bit_%=\n\t" + "bpl L_div_384_word_12_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -49900,9 +49982,9 @@ static void sp_384_div2_mod_12(sp_digit* r, const sp_digit* a, const sp_digit* m "LDM %[a]!, {r4}\n\t" "ANDS r3, r4, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_div2_mod_12_even_%=\n\t" + "BEQ L_sp_384_div2_mod_12_even\n\t" #else - "BEQ.N L_sp_384_div2_mod_12_even_%=\n\t" + "BEQ.N L_sp_384_div2_mod_12_even\n\t" #endif "MOV r12, #0x0\n\t" "LDM %[a]!, {r5, r6, r7}\n\t" @@ -49927,9 +50009,13 @@ static void sp_384_div2_mod_12(sp_digit* r, const sp_digit* a, const sp_digit* m "ADCS r7, r7, r11\n\t" "STM %[r]!, {r4, r5, r6, r7}\n\t" "ADC r3, r12, r12\n\t" - "B L_sp_384_div2_mod_12_div2_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_div2_mod_12_div2\n\t" +#else + "B.N L_sp_384_div2_mod_12_div2\n\t" +#endif "\n" - "L_sp_384_div2_mod_12_even_%=:\n\t" + "L_sp_384_div2_mod_12_even:\n\t" "LDM %[a]!, {r5, r6, r7}\n\t" "STM %[r]!, {r4, r5, r6, r7}\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" @@ -49937,7 +50023,7 @@ static void sp_384_div2_mod_12(sp_digit* r, const sp_digit* a, const sp_digit* m "LDM %[a]!, {r4, r5, r6, r7}\n\t" "STM %[r]!, {r4, r5, r6, r7}\n\t" "\n" - "L_sp_384_div2_mod_12_div2_%=:\n\t" + "L_sp_384_div2_mod_12_div2:\n\t" "SUB %[r], %[r], #0x30\n\t" "LDRD r8, r9, [%[r]]\n\t" "LSR r8, r8, #1\n\t" @@ -50006,152 +50092,196 @@ static int sp_384_num_bits_12(const sp_digit* a) "LDR r1, [%[a], #44]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_11_%=\n\t" + "BEQ L_sp_384_num_bits_12_11\n\t" #else - "BEQ.N L_sp_384_num_bits_12_11_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_11\n\t" #endif "MOV r2, #0x180\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_11_%=:\n\t" + "L_sp_384_num_bits_12_11:\n\t" "LDR r1, [%[a], #40]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_10_%=\n\t" + "BEQ L_sp_384_num_bits_12_10\n\t" #else - "BEQ.N L_sp_384_num_bits_12_10_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_10\n\t" #endif "MOV r2, #0x160\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B 
L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_10_%=:\n\t" + "L_sp_384_num_bits_12_10:\n\t" "LDR r1, [%[a], #36]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_9_%=\n\t" + "BEQ L_sp_384_num_bits_12_9\n\t" #else - "BEQ.N L_sp_384_num_bits_12_9_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_9\n\t" #endif "MOV r2, #0x140\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_9_%=:\n\t" + "L_sp_384_num_bits_12_9:\n\t" "LDR r1, [%[a], #32]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_8_%=\n\t" + "BEQ L_sp_384_num_bits_12_8\n\t" #else - "BEQ.N L_sp_384_num_bits_12_8_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_8\n\t" #endif "MOV r2, #0x120\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_8_%=:\n\t" + "L_sp_384_num_bits_12_8:\n\t" "LDR r1, [%[a], #28]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_7_%=\n\t" + "BEQ L_sp_384_num_bits_12_7\n\t" #else - "BEQ.N L_sp_384_num_bits_12_7_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_7\n\t" #endif "MOV r2, #0x100\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_7_%=:\n\t" + "L_sp_384_num_bits_12_7:\n\t" "LDR r1, [%[a], #24]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_6_%=\n\t" + "BEQ L_sp_384_num_bits_12_6\n\t" #else - "BEQ.N L_sp_384_num_bits_12_6_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_6\n\t" #endif "MOV r2, #0xe0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_6_%=:\n\t" + "L_sp_384_num_bits_12_6:\n\t" "LDR r1, [%[a], #20]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_5_%=\n\t" + "BEQ L_sp_384_num_bits_12_5\n\t" #else - "BEQ.N L_sp_384_num_bits_12_5_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_5\n\t" #endif "MOV r2, #0xc0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_5_%=:\n\t" + "L_sp_384_num_bits_12_5:\n\t" "LDR r1, [%[a], #16]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_4_%=\n\t" + "BEQ L_sp_384_num_bits_12_4\n\t" #else - "BEQ.N L_sp_384_num_bits_12_4_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_4\n\t" #endif "MOV r2, #0xa0\n\t" "CLZ r4, r1\n\t" 
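/* The sp_384_num_bits_12 ladder here scans for the highest non-zero
 * 32-bit word a[i] and computes 32*(i+1) - CLZ(a[i]); the MOV r2
 * constants (0x180, 0x160, ..., 0x20) are the 32*(i+1) values for word
 * indexes 11 down to 0. The #ifdef guards bracketing each branch choose
 * between plain mnemonics and explicit Thumb-2 width suffixes (.N =
 * narrow 16-bit, .W = wide 32-bit encoding) for toolchains that need
 * the branch width spelled out. A minimal C sketch of the same bit
 * count, assuming GCC's __builtin_clz() as the CLZ helper (illustrative
 * code, not part of the patch):
 */
static int num_bits_sketch(const unsigned int* a, int words)
{
    int i;
    for (i = words - 1; i >= 0; i--) {
        if (a[i] != 0) {
            /* e.g. i == 11 for sp_384: 0x180 (= 384) - leading zeros */
            return 32 * (i + 1) - __builtin_clz(a[i]);
        }
    }
    return 0; /* a == 0: __builtin_clz(0) is undefined, so guard it */
}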
"SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_4_%=:\n\t" + "L_sp_384_num_bits_12_4:\n\t" "LDR r1, [%[a], #12]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_3_%=\n\t" + "BEQ L_sp_384_num_bits_12_3\n\t" #else - "BEQ.N L_sp_384_num_bits_12_3_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_3\n\t" #endif "MOV r2, #0x80\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_3_%=:\n\t" + "L_sp_384_num_bits_12_3:\n\t" "LDR r1, [%[a], #8]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_2_%=\n\t" + "BEQ L_sp_384_num_bits_12_2\n\t" #else - "BEQ.N L_sp_384_num_bits_12_2_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_2\n\t" #endif "MOV r2, #0x60\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_2_%=:\n\t" + "L_sp_384_num_bits_12_2:\n\t" "LDR r1, [%[a], #4]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_384_num_bits_12_1_%=\n\t" + "BEQ L_sp_384_num_bits_12_1\n\t" #else - "BEQ.N L_sp_384_num_bits_12_1_%=\n\t" + "BEQ.N L_sp_384_num_bits_12_1\n\t" #endif "MOV r2, #0x40\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_384_num_bits_12_13_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_384_num_bits_12_13\n\t" +#else + "B.N L_sp_384_num_bits_12_13\n\t" +#endif "\n" - "L_sp_384_num_bits_12_1_%=:\n\t" + "L_sp_384_num_bits_12_1:\n\t" "LDR r1, [%[a]]\n\t" "MOV r2, #0x20\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" "\n" - "L_sp_384_num_bits_12_13_%=:\n\t" + "L_sp_384_num_bits_12_13:\n\t" "MOV %[a], r4\n\t" : [a] "+r" (a) : @@ -51312,64 +51442,83 @@ static void sp_521_mul_17(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x88\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_521_mul_17_outer_%=:\n\t" + "L_sp_521_mul_17_outer:\n\t" "SUBS r3, r5, #0x40\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_521_mul_17_inner_%=:\n\t" + "L_sp_521_mul_17_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x44\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_mul_17_inner_done_%=\n\t" + "BGT L_sp_521_mul_17_inner_done\n\t" #else - "BEQ.N L_sp_521_mul_17_inner_done_%=\n\t" + "BGT.N 
L_sp_521_mul_17_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_521_mul_17_inner_%=\n\t" + "BLT L_sp_521_mul_17_inner\n\t" #else - "BLE.N L_sp_521_mul_17_inner_%=\n\t" + "BLT.N L_sp_521_mul_17_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_521_mul_17_inner_done_%=:\n\t" + "L_sp_521_mul_17_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x80\n\t" + "CMP r5, #0x7c\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_521_mul_17_outer_%=\n\t" + "BLE L_sp_521_mul_17_outer\n\t" #else - "BLE.N L_sp_521_mul_17_outer_%=\n\t" + "BLE.N L_sp_521_mul_17_outer\n\t" #endif + "LDR lr, [%[a], #64]\n\t" + "LDR r11, [%[b], #64]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "LDM sp!, {r6, r7}\n\t" "STM %[r]!, {r6, r7}\n\t" "SUB r5, r5, #0x8\n\t" "\n" - "L_sp_521_mul_17_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_521_mul_17_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_521_mul_17_store_%=\n\t" + "BGT L_sp_521_mul_17_store\n\t" #else - "BGT.N L_sp_521_mul_17_store_%=\n\t" + "BGT.N L_sp_521_mul_17_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -53439,24 +53588,20 @@ static void sp_521_sqr_17(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x88\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_521_sqr_17_outer_%=:\n\t" + "L_sp_521_sqr_17_outer:\n\t" "SUBS r3, r5, #0x40\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_521_sqr_17_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_sqr_17_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_521_sqr_17_op_sqr_%=\n\t" -#endif + "L_sp_521_sqr_17_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -53466,62 +53611,54 @@ static void sp_521_sqr_17(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_521_sqr_17_op_done_%=\n\t" - "\n" - "L_sp_521_sqr_17_op_sqr_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_521_sqr_17_inner_done\n\t" +#else + "BGT.N L_sp_521_sqr_17_inner_done\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_521_sqr_17_inner\n\t" +#else + "BLT.N L_sp_521_sqr_17_inner\n\t" +#endif "LDR lr, [%[a], r3]\n\t" "UMULL r9, r10, lr, lr\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_521_sqr_17_op_done_%=:\n\t" - "ADD r3, r3, #0x4\n\t" - "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x44\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_sqr_17_inner_done_%=\n\t" -#else - "BEQ.N 
L_sp_521_sqr_17_inner_done_%=\n\t" -#endif - "CMP r3, r4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_521_sqr_17_inner_done_%=\n\t" -#else - "BGT.N L_sp_521_sqr_17_inner_done_%=\n\t" -#endif - "CMP r3, r5\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_521_sqr_17_inner_%=\n\t" -#else - "BLE.N L_sp_521_sqr_17_inner_%=\n\t" -#endif - "\n" - "L_sp_521_sqr_17_inner_done_%=:\n\t" + "L_sp_521_sqr_17_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0x80\n\t" + "CMP r5, #0x7c\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_521_sqr_17_outer_%=\n\t" + "BLE L_sp_521_sqr_17_outer\n\t" #else - "BLE.N L_sp_521_sqr_17_outer_%=\n\t" + "BLE.N L_sp_521_sqr_17_outer\n\t" #endif + "LDR lr, [%[a], #64]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "LDM sp!, {r6, r7}\n\t" "STM %[r]!, {r6, r7}\n\t" "SUB r5, r5, #0x8\n\t" "\n" - "L_sp_521_sqr_17_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_521_sqr_17_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_521_sqr_17_store_%=\n\t" + "BGT L_sp_521_sqr_17_store\n\t" #else - "BGT.N L_sp_521_sqr_17_store_%=\n\t" + "BGT.N L_sp_521_sqr_17_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -54838,7 +54975,7 @@ static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x40\n\t" "\n" - "L_sp_521_add_17_word_%=:\n\t" + "L_sp_521_add_17_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -54851,9 +54988,9 @@ static sp_digit sp_521_add_17(sp_digit* r, const sp_digit* a, const sp_digit* b) "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_521_add_17_word_%=\n\t" + "BNE L_sp_521_add_17_word\n\t" #else - "BNE.N L_sp_521_add_17_word_%=\n\t" + "BNE.N L_sp_521_add_17_word\n\t" #endif "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a], {r4}\n\t" @@ -55171,7 +55308,7 @@ static sp_digit sp_521_cond_sub_17(sp_digit* r, const sp_digit* a, const sp_digi "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_521_cond_sub_17_words_%=:\n\t" + "L_sp_521_cond_sub_17_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -55182,9 +55319,9 @@ static sp_digit sp_521_cond_sub_17(sp_digit* r, const sp_digit* a, const sp_digi "ADD r5, r5, #0x4\n\t" "CMP r5, #0x44\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_521_cond_sub_17_words_%=\n\t" + "BLT L_sp_521_cond_sub_17_words\n\t" #else - "BLT.N L_sp_521_cond_sub_17_words_%=\n\t" + "BLT.N L_sp_521_cond_sub_17_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -55451,19 +55588,19 @@ static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* m, sp_digit "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_521_mont_reduce_order_17_word_%=:\n\t" + "L_sp_521_mont_reduce_order_17_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" "CMP r11, #0x40\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || 
defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_521_mont_reduce_order_17_nomask_%=\n\t" + "BNE L_sp_521_mont_reduce_order_17_nomask\n\t" #else - "BNE.N L_sp_521_mont_reduce_order_17_nomask_%=\n\t" + "BNE.N L_sp_521_mont_reduce_order_17_nomask\n\t" #endif "MOV r9, #0x1ff\n\t" "AND r10, r10, r9\n\t" "\n" - "L_sp_521_mont_reduce_order_17_nomask_%=:\n\t" + "L_sp_521_mont_reduce_order_17_nomask:\n\t" /* a[i+0] += m[0] * mu */ "MOV r7, #0x0\n\t" "UMLAL r4, r7, r10, lr\n\t" @@ -55605,9 +55742,9 @@ static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* m, sp_digit "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x44\n\t" #ifdef __GNUC__ - "BLT L_sp_521_mont_reduce_order_17_word_%=\n\t" + "BLT L_sp_521_mont_reduce_order_17_word\n\t" #else - "BLT.W L_sp_521_mont_reduce_order_17_word_%=\n\t" + "BLT.W L_sp_521_mont_reduce_order_17_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -55719,19 +55856,19 @@ static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* m, sp_digit "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_521_mont_reduce_order_17_word_%=:\n\t" + "L_sp_521_mont_reduce_order_17_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" "CMP r4, #0x40\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_521_mont_reduce_order_17_nomask_%=\n\t" + "BNE L_sp_521_mont_reduce_order_17_nomask\n\t" #else - "BNE.N L_sp_521_mont_reduce_order_17_nomask_%=\n\t" + "BNE.N L_sp_521_mont_reduce_order_17_nomask\n\t" #endif "MOV r12, #0x1ff\n\t" "AND lr, lr, r12\n\t" "\n" - "L_sp_521_mont_reduce_order_17_nomask_%=:\n\t" + "L_sp_521_mont_reduce_order_17_nomask:\n\t" /* a[i+0] += m[0] * mu */ "LDR r12, [%[m]]\n\t" "MOV r3, #0x0\n\t" @@ -55823,9 +55960,9 @@ static void sp_521_mont_reduce_order_17(sp_digit* a, const sp_digit* m, sp_digit "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x44\n\t" #ifdef __GNUC__ - "BLT L_sp_521_mont_reduce_order_17_word_%=\n\t" + "BLT L_sp_521_mont_reduce_order_17_word\n\t" #else - "BLT.W L_sp_521_mont_reduce_order_17_word_%=\n\t" + "BLT.W L_sp_521_mont_reduce_order_17_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -56077,7 +56214,7 @@ static sp_int32 sp_521_cmp_17(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x40\n\t" "\n" - "L_sp_521_cmp_17_words_%=:\n\t" + "L_sp_521_cmp_17_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -56090,7 +56227,7 @@ static sp_int32 sp_521_cmp_17(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_521_cmp_17_words_%=\n\t" + "bcs L_sp_521_cmp_17_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #64]\n\t" @@ -61870,7 +62007,7 @@ static sp_digit sp_521_sub_in_place_17(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x40\n\t" "\n" - "L_sp_521_sub_in_pkace_17_word_%=:\n\t" + "L_sp_521_sub_in_pkace_17_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, r9}\n\t" @@ -61882,9 +62019,9 @@ static sp_digit sp_521_sub_in_place_17(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_521_sub_in_pkace_17_word_%=\n\t" + "BNE L_sp_521_sub_in_pkace_17_word\n\t" #else - "BNE.N L_sp_521_sub_in_pkace_17_word_%=\n\t" + "BNE.N L_sp_521_sub_in_pkace_17_word\n\t" #endif "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2}\n\t" @@ -61986,7 +62123,7 @@ static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, 
#0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_521_mul_d_17_word_%=:\n\t" + "L_sp_521_mul_d_17_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -62000,9 +62137,9 @@ static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x44\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_521_mul_d_17_word_%=\n\t" + "BLT L_sp_521_mul_d_17_word\n\t" #else - "BLT.N L_sp_521_mul_d_17_word_%=\n\t" + "BLT.N L_sp_521_mul_d_17_word\n\t" #endif "STR r3, [%[r], #68]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -62134,9 +62271,9 @@ static void sp_521_mul_d_17(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -62199,9 +62336,9 @@ static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -62225,7 +62362,7 @@ static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_521_word_17_bit_%=:\n\t" + "L_div_521_word_17_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -62235,7 +62372,7 @@ static sp_digit div_521_word_17(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_521_word_17_bit_%=\n\t" + "bpl L_div_521_word_17_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -62930,7 +63067,7 @@ static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a, const sp_digit* b) "MOV r11, #0x0\n\t" "ADD r12, %[a], #0x40\n\t" "\n" - "L_sp_521_sub_17_word_%=:\n\t" + "L_sp_521_sub_17_word:\n\t" "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3, r4, r5, r6}\n\t" "LDM %[b]!, {r7, r8, r9, r10}\n\t" @@ -62942,9 +63079,9 @@ static sp_digit sp_521_sub_17(sp_digit* r, const sp_digit* a, const sp_digit* b) "SBC r11, r3, r3\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_521_sub_17_word_%=\n\t" + "BNE L_sp_521_sub_17_word\n\t" #else - "BNE.N L_sp_521_sub_17_word_%=\n\t" + "BNE.N L_sp_521_sub_17_word\n\t" #endif "RSBS r11, r11, #0x0\n\t" "LDM %[a]!, {r3}\n\t" @@ -63042,9 +63179,9 @@ static void sp_521_div2_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m "LDM %[a]!, {r4}\n\t" "ANDS r3, r4, #0x1\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_div2_mod_17_even_%=\n\t" + "BEQ L_sp_521_div2_mod_17_even\n\t" #else - "BEQ.N L_sp_521_div2_mod_17_even_%=\n\t" + "BEQ.N 
L_sp_521_div2_mod_17_even\n\t" #endif "MOV r12, #0x0\n\t" "LDM %[a]!, {r5, r6, r7}\n\t" @@ -63080,9 +63217,13 @@ static void sp_521_div2_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m "ADCS r4, r4, r8\n\t" "STM %[r]!, {r4}\n\t" "ADC r3, r12, r12\n\t" - "B L_sp_521_div2_mod_17_div2_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_div2_mod_17_div2\n\t" +#else + "B.N L_sp_521_div2_mod_17_div2\n\t" +#endif "\n" - "L_sp_521_div2_mod_17_even_%=:\n\t" + "L_sp_521_div2_mod_17_even:\n\t" "LDM %[a]!, {r5, r6, r7}\n\t" "STM %[r]!, {r4, r5, r6, r7}\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" @@ -63094,7 +63235,7 @@ static void sp_521_div2_mod_17(sp_digit* r, const sp_digit* a, const sp_digit* m "LDM %[a]!, {r4}\n\t" "STM %[r]!, {r4}\n\t" "\n" - "L_sp_521_div2_mod_17_div2_%=:\n\t" + "L_sp_521_div2_mod_17_div2:\n\t" "SUB %[r], %[r], #0x44\n\t" "LDRD r8, r9, [%[r]]\n\t" "LSR r8, r8, #1\n\t" @@ -63183,217 +63324,281 @@ static int sp_521_num_bits_17(const sp_digit* a) "LDR r1, [%[a], #64]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_16_%=\n\t" + "BEQ L_sp_521_num_bits_17_16\n\t" #else - "BEQ.N L_sp_521_num_bits_17_16_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_16\n\t" #endif "MOV r2, #0x220\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_16_%=:\n\t" + "L_sp_521_num_bits_17_16:\n\t" "LDR r1, [%[a], #60]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_15_%=\n\t" + "BEQ L_sp_521_num_bits_17_15\n\t" #else - "BEQ.N L_sp_521_num_bits_17_15_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_15\n\t" #endif "MOV r2, #0x200\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_15_%=:\n\t" + "L_sp_521_num_bits_17_15:\n\t" "LDR r1, [%[a], #56]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_14_%=\n\t" + "BEQ L_sp_521_num_bits_17_14\n\t" #else - "BEQ.N L_sp_521_num_bits_17_14_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_14\n\t" #endif "MOV r2, #0x1e0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_14_%=:\n\t" + "L_sp_521_num_bits_17_14:\n\t" "LDR r1, [%[a], #52]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_13_%=\n\t" + "BEQ L_sp_521_num_bits_17_13\n\t" #else - "BEQ.N L_sp_521_num_bits_17_13_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_13\n\t" #endif "MOV r2, #0x1c0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_13_%=:\n\t" + "L_sp_521_num_bits_17_13:\n\t" "LDR r1, [%[a], #48]\n\t" "CMP r1, #0x0\n\t" 
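/* sp_521_div2_mod_17 just above (like the sp_384 variant earlier)
 * halves a modulo m: when a is odd it first adds m so the sum is even,
 * keeping the carry out of the top word (r3), then shifts the whole
 * multi-word value right one bit with the LSR/ORR ... LSL #31 pairs.
 * A minimal C sketch of the idea, assuming 32-bit digits; the names
 * below are illustrative, not wolfSSL API:
 */
typedef unsigned int digit_t;

static void div2_mod_sketch(digit_t* r, const digit_t* a,
                            const digit_t* m, int n)
{
    unsigned long long t = 0;
    digit_t top = 0; /* bit carried out of the top word by a + m */
    int i;

    if (a[0] & 1) {
        for (i = 0; i < n; i++) { /* r = a + m, word by word */
            t += (unsigned long long)a[i] + m[i];
            r[i] = (digit_t)t;
            t >>= 32;
        }
        top = (digit_t)t;
    }
    else {
        for (i = 0; i < n; i++)
            r[i] = a[i];
    }
    /* shift right one bit, feeding each word the low bit of the next */
    for (i = 0; i < n - 1; i++)
        r[i] = (r[i] >> 1) | (r[i + 1] << 31);
    r[n - 1] = (r[n - 1] >> 1) | (top << 31);
}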
#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_12_%=\n\t" + "BEQ L_sp_521_num_bits_17_12\n\t" #else - "BEQ.N L_sp_521_num_bits_17_12_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_12\n\t" #endif "MOV r2, #0x1a0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_12_%=:\n\t" + "L_sp_521_num_bits_17_12:\n\t" "LDR r1, [%[a], #44]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_11_%=\n\t" + "BEQ L_sp_521_num_bits_17_11\n\t" #else - "BEQ.N L_sp_521_num_bits_17_11_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_11\n\t" #endif "MOV r2, #0x180\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_11_%=:\n\t" + "L_sp_521_num_bits_17_11:\n\t" "LDR r1, [%[a], #40]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_10_%=\n\t" + "BEQ L_sp_521_num_bits_17_10\n\t" #else - "BEQ.N L_sp_521_num_bits_17_10_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_10\n\t" #endif "MOV r2, #0x160\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_10_%=:\n\t" + "L_sp_521_num_bits_17_10:\n\t" "LDR r1, [%[a], #36]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_9_%=\n\t" + "BEQ L_sp_521_num_bits_17_9\n\t" #else - "BEQ.N L_sp_521_num_bits_17_9_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_9\n\t" #endif "MOV r2, #0x140\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_9_%=:\n\t" + "L_sp_521_num_bits_17_9:\n\t" "LDR r1, [%[a], #32]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_8_%=\n\t" + "BEQ L_sp_521_num_bits_17_8\n\t" #else - "BEQ.N L_sp_521_num_bits_17_8_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_8\n\t" #endif "MOV r2, #0x120\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_8_%=:\n\t" + "L_sp_521_num_bits_17_8:\n\t" "LDR r1, [%[a], #28]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_7_%=\n\t" + "BEQ L_sp_521_num_bits_17_7\n\t" #else - "BEQ.N L_sp_521_num_bits_17_7_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_7\n\t" #endif "MOV r2, #0x100\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + 
"B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_7_%=:\n\t" + "L_sp_521_num_bits_17_7:\n\t" "LDR r1, [%[a], #24]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_6_%=\n\t" + "BEQ L_sp_521_num_bits_17_6\n\t" #else - "BEQ.N L_sp_521_num_bits_17_6_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_6\n\t" #endif "MOV r2, #0xe0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_6_%=:\n\t" + "L_sp_521_num_bits_17_6:\n\t" "LDR r1, [%[a], #20]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_5_%=\n\t" + "BEQ L_sp_521_num_bits_17_5\n\t" #else - "BEQ.N L_sp_521_num_bits_17_5_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_5\n\t" #endif "MOV r2, #0xc0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_5_%=:\n\t" + "L_sp_521_num_bits_17_5:\n\t" "LDR r1, [%[a], #16]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_4_%=\n\t" + "BEQ L_sp_521_num_bits_17_4\n\t" #else - "BEQ.N L_sp_521_num_bits_17_4_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_4\n\t" #endif "MOV r2, #0xa0\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_4_%=:\n\t" + "L_sp_521_num_bits_17_4:\n\t" "LDR r1, [%[a], #12]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_3_%=\n\t" + "BEQ L_sp_521_num_bits_17_3\n\t" #else - "BEQ.N L_sp_521_num_bits_17_3_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_3\n\t" #endif "MOV r2, #0x80\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_3_%=:\n\t" + "L_sp_521_num_bits_17_3:\n\t" "LDR r1, [%[a], #8]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_2_%=\n\t" + "BEQ L_sp_521_num_bits_17_2\n\t" #else - "BEQ.N L_sp_521_num_bits_17_2_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_2\n\t" #endif "MOV r2, #0x60\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_2_%=:\n\t" + "L_sp_521_num_bits_17_2:\n\t" "LDR r1, [%[a], #4]\n\t" "CMP r1, #0x0\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_521_num_bits_17_1_%=\n\t" + "BEQ L_sp_521_num_bits_17_1\n\t" #else - "BEQ.N L_sp_521_num_bits_17_1_%=\n\t" + "BEQ.N L_sp_521_num_bits_17_1\n\t" #endif "MOV r2, #0x40\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" - "B 
L_sp_521_num_bits_17_18_%=\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "B L_sp_521_num_bits_17_18\n\t" +#else + "B.N L_sp_521_num_bits_17_18\n\t" +#endif "\n" - "L_sp_521_num_bits_17_1_%=:\n\t" + "L_sp_521_num_bits_17_1:\n\t" "LDR r1, [%[a]]\n\t" "MOV r2, #0x20\n\t" "CLZ r4, r1\n\t" "SUB r4, r2, r4\n\t" "\n" - "L_sp_521_num_bits_17_18_%=:\n\t" + "L_sp_521_num_bits_17_18:\n\t" "MOV %[a], r4\n\t" : [a] "+r" (a) : @@ -67780,61 +67985,80 @@ static void sp_1024_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r5, #0x0\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "LDR r11, [%[b]]\n\t" + "UMULL r8, r6, lr, r11\n\t" + "STR r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_1024_mul_32_outer_%=:\n\t" + "L_sp_1024_mul_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_1024_mul_32_inner_%=:\n\t" + "L_sp_1024_mul_32_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[b], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" + "LDR lr, [%[a], r4]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "ADD r3, r3, #0x4\n\t" "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" + "CMP r3, r4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_1024_mul_32_inner_done_%=\n\t" + "BGT L_sp_1024_mul_32_inner_done\n\t" #else - "BEQ.N L_sp_1024_mul_32_inner_done_%=\n\t" + "BGT.N L_sp_1024_mul_32_inner_done\n\t" #endif - "CMP r3, r5\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_1024_mul_32_inner_%=\n\t" + "BLT L_sp_1024_mul_32_inner\n\t" #else - "BLE.N L_sp_1024_mul_32_inner_%=\n\t" + "BLT.N L_sp_1024_mul_32_inner\n\t" #endif + "LDR lr, [%[a], r3]\n\t" + "LDR r11, [%[b], r3]\n\t" + "UMULL r9, r10, lr, r11\n\t" + "ADDS r6, r6, r9\n\t" + "ADCS r7, r7, r10\n\t" + "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_1024_mul_32_inner_done_%=:\n\t" + "L_sp_1024_mul_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_1024_mul_32_outer_%=\n\t" + "BLE L_sp_1024_mul_32_outer\n\t" #else - "BLE.N L_sp_1024_mul_32_outer_%=\n\t" + "BLE.N L_sp_1024_mul_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "LDR r11, [%[b], #124]\n\t" + "UMLAL r6, r7, lr, r11\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_1024_mul_32_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_1024_mul_32_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_1024_mul_32_store_%=\n\t" + "BGT L_sp_1024_mul_32_store\n\t" #else - "BGT.N L_sp_1024_mul_32_store_%=\n\t" + "BGT.N L_sp_1024_mul_32_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) : @@ -67860,24 +68084,20 @@ static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) __asm__ __volatile__ ( "SUB sp, sp, #0x100\n\t" - "MOV r6, #0x0\n\t" + "LDR lr, [%[a]]\n\t" + "UMULL r8, r6, lr, lr\n\t" + "STR 
r8, [sp]\n\t" "MOV r7, #0x0\n\t" "MOV r8, #0x0\n\t" - "MOV r5, #0x0\n\t" + "MOV r5, #0x4\n\t" "\n" - "L_sp_1024_sqr_32_outer_%=:\n\t" + "L_sp_1024_sqr_32_outer:\n\t" "SUBS r3, r5, #0x7c\n\t" "IT cc\n\t" - "movcc r3, #0\n\t" + "MOVCC r3, #0x0\n\t" "SUB r4, r5, r3\n\t" "\n" - "L_sp_1024_sqr_32_inner_%=:\n\t" - "CMP r4, r3\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_1024_sqr_32_op_sqr_%=\n\t" -#else - "BEQ.N L_sp_1024_sqr_32_op_sqr_%=\n\t" -#endif + "L_sp_1024_sqr_32_inner:\n\t" "LDR lr, [%[a], r3]\n\t" "LDR r11, [%[a], r4]\n\t" "UMULL r9, r10, lr, r11\n\t" @@ -67887,59 +68107,51 @@ static void sp_1024_sqr_32(sp_digit* r, const sp_digit* a) "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" - "bal L_sp_1024_sqr_32_op_done_%=\n\t" - "\n" - "L_sp_1024_sqr_32_op_sqr_%=:\n\t" + "ADD r3, r3, #0x4\n\t" + "SUB r4, r4, #0x4\n\t" + "CMP r3, r4\n\t" +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BGT L_sp_1024_sqr_32_inner_done\n\t" +#else + "BGT.N L_sp_1024_sqr_32_inner_done\n\t" +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + "BLT L_sp_1024_sqr_32_inner\n\t" +#else + "BLT.N L_sp_1024_sqr_32_inner\n\t" +#endif "LDR lr, [%[a], r3]\n\t" "UMULL r9, r10, lr, lr\n\t" "ADDS r6, r6, r9\n\t" "ADCS r7, r7, r10\n\t" "ADC r8, r8, #0x0\n\t" "\n" - "L_sp_1024_sqr_32_op_done_%=:\n\t" - "ADD r3, r3, #0x4\n\t" - "SUB r4, r4, #0x4\n\t" - "CMP r3, #0x80\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BEQ L_sp_1024_sqr_32_inner_done_%=\n\t" -#else - "BEQ.N L_sp_1024_sqr_32_inner_done_%=\n\t" -#endif - "CMP r3, r4\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_1024_sqr_32_inner_done_%=\n\t" -#else - "BGT.N L_sp_1024_sqr_32_inner_done_%=\n\t" -#endif - "CMP r3, r5\n\t" -#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_1024_sqr_32_inner_%=\n\t" -#else - "BLE.N L_sp_1024_sqr_32_inner_%=\n\t" -#endif - "\n" - "L_sp_1024_sqr_32_inner_done_%=:\n\t" + "L_sp_1024_sqr_32_inner_done:\n\t" "STR r6, [sp, r5]\n\t" "MOV r6, r7\n\t" "MOV r7, r8\n\t" "MOV r8, #0x0\n\t" "ADD r5, r5, #0x4\n\t" - "CMP r5, #0xf8\n\t" + "CMP r5, #0xf4\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLE L_sp_1024_sqr_32_outer_%=\n\t" + "BLE L_sp_1024_sqr_32_outer\n\t" #else - "BLE.N L_sp_1024_sqr_32_outer_%=\n\t" + "BLE.N L_sp_1024_sqr_32_outer\n\t" #endif + "LDR lr, [%[a], #124]\n\t" + "UMLAL r6, r7, lr, lr\n\t" "STR r6, [sp, r5]\n\t" + "ADD r5, r5, #0x4\n\t" + "STR r7, [sp, r5]\n\t" "\n" - "L_sp_1024_sqr_32_store_%=:\n\t" - "LDM sp!, {r6, r7, r8, r9}\n\t" - "STM %[r]!, {r6, r7, r8, r9}\n\t" - "SUBS r5, r5, #0x10\n\t" + "L_sp_1024_sqr_32_store:\n\t" + "LDM sp!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "STM %[r]!, {r3, r4, r6, r7, r8, r9, r10, r11}\n\t" + "SUBS r5, r5, #0x20\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BGT L_sp_1024_sqr_32_store_%=\n\t" + "BGT L_sp_1024_sqr_32_store\n\t" #else - "BGT.N L_sp_1024_sqr_32_store_%=\n\t" + "BGT.N L_sp_1024_sqr_32_store\n\t" #endif : [r] "+r" (r), [a] "+r" (a) : @@ -68054,7 +68266,7 @@ static sp_digit sp_1024_sub_in_place_32(sp_digit* a, const sp_digit* b) "MOV r10, #0x0\n\t" "ADD r11, %[a], #0x80\n\t" "\n" - "L_sp_1024_sub_in_pkace_32_word_%=:\n\t" + "L_sp_1024_sub_in_pkace_32_word:\n\t" "RSBS r10, r10, #0x0\n\t" "LDM %[a], {r2, r3, r4, r5}\n\t" "LDM %[b]!, {r6, r7, r8, 
r9}\n\t" @@ -68066,9 +68278,9 @@ static sp_digit sp_1024_sub_in_place_32(sp_digit* a, const sp_digit* b) "SBC r10, r10, r10\n\t" "CMP %[a], r11\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_1024_sub_in_pkace_32_word_%=\n\t" + "BNE L_sp_1024_sub_in_pkace_32_word\n\t" #else - "BNE.N L_sp_1024_sub_in_pkace_32_word_%=\n\t" + "BNE.N L_sp_1024_sub_in_pkace_32_word\n\t" #endif "MOV %[a], r10\n\t" : [a] "+r" (a), [b] "+r" (b) @@ -68106,7 +68318,7 @@ static sp_digit sp_1024_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_dig "MOV r4, #0x0\n\t" "MOV r5, #0x0\n\t" "\n" - "L_sp_1024_cond_sub_32_words_%=:\n\t" + "L_sp_1024_cond_sub_32_words:\n\t" "SUBS r4, r8, r4\n\t" "LDR r6, [%[a], r5]\n\t" "LDR r7, [%[b], r5]\n\t" @@ -68117,9 +68329,9 @@ static sp_digit sp_1024_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_dig "ADD r5, r5, #0x4\n\t" "CMP r5, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_1024_cond_sub_32_words_%=\n\t" + "BLT L_sp_1024_cond_sub_32_words\n\t" #else - "BLT.N L_sp_1024_cond_sub_32_words_%=\n\t" + "BLT.N L_sp_1024_cond_sub_32_words\n\t" #endif "MOV %[r], r4\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) @@ -68297,7 +68509,7 @@ static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b "MOV r3, #0x0\n\t" "ADD r12, %[a], #0x80\n\t" "\n" - "L_sp_1024_add_32_word_%=:\n\t" + "L_sp_1024_add_32_word:\n\t" "ADDS r3, r3, #0xffffffff\n\t" "LDM %[a]!, {r4, r5, r6, r7}\n\t" "LDM %[b]!, {r8, r9, r10, r11}\n\t" @@ -68310,9 +68522,9 @@ static sp_digit sp_1024_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b "ADC r3, r4, #0x0\n\t" "CMP %[a], r12\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BNE L_sp_1024_add_32_word_%=\n\t" + "BNE L_sp_1024_add_32_word\n\t" #else - "BNE.N L_sp_1024_add_32_word_%=\n\t" + "BNE.N L_sp_1024_add_32_word\n\t" #endif "MOV %[r], r3\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -68351,7 +68563,7 @@ static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) "MOV r5, #0x0\n\t" "MOV r9, #0x4\n\t" "\n" - "L_sp_1024_mul_d_32_word_%=:\n\t" + "L_sp_1024_mul_d_32_word:\n\t" /* A[i] * B */ "LDR r8, [%[a], r9]\n\t" "UMULL r6, r7, %[b], r8\n\t" @@ -68365,9 +68577,9 @@ static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) "ADD r9, r9, #0x4\n\t" "CMP r9, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT L_sp_1024_mul_d_32_word_%=\n\t" + "BLT L_sp_1024_mul_d_32_word\n\t" #else - "BLT.N L_sp_1024_mul_d_32_word_%=\n\t" + "BLT.N L_sp_1024_mul_d_32_word\n\t" #endif "STR r3, [%[r], #128]\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) @@ -68574,9 +68786,9 @@ static void sp_1024_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b) * Note that this is an approximate div. It may give an answer 1 larger. */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -68639,9 +68851,9 @@ static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) * Note that this is an approximate div. It may give an answer 1 larger. 
*/ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG -static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) +SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1_p, sp_digit d0_p, sp_digit div_p) #else -static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) +SP_NOINLINE static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) #endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ { #ifndef WOLFSSL_NO_VAR_ASSIGN_REG @@ -68665,7 +68877,7 @@ static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) /* Next 30 bits */ "MOV r4, #0x1d\n\t" "\n" - "L_div_1024_word_32_bit_%=:\n\t" + "L_div_1024_word_32_bit:\n\t" "LSLS r6, r6, #1\n\t" "ADC r7, r7, r7\n\t" "SUBS r8, r5, r7\n\t" @@ -68675,7 +68887,7 @@ static sp_digit div_1024_word_32(sp_digit d1, sp_digit d0, sp_digit div) "AND r8, r8, r5\n\t" "SUBS r7, r7, r8\n\t" "SUBS r4, r4, #0x1\n\t" - "bpl L_div_1024_word_32_bit_%=\n\t" + "bpl L_div_1024_word_32_bit\n\t" "ADD r3, r3, r3\n\t" "ADD r3, r3, #0x1\n\t" "UMULL r6, r7, r3, %[div]\n\t" @@ -68757,7 +68969,7 @@ static sp_int32 sp_1024_cmp_32(const sp_digit* a, const sp_digit* b) #ifdef WOLFSSL_SP_SMALL "MOV r6, #0x7c\n\t" "\n" - "L_sp_1024_cmp_32_words_%=:\n\t" + "L_sp_1024_cmp_32_words:\n\t" "LDR r4, [%[a], r6]\n\t" "LDR r5, [%[b], r6]\n\t" "AND r4, r4, r3\n\t" @@ -68770,7 +68982,7 @@ static sp_int32 sp_1024_cmp_32(const sp_digit* a, const sp_digit* b) "IT ne\n\t" "movne r3, r7\n\t" "SUBS r6, r6, #0x4\n\t" - "bcs L_sp_1024_cmp_32_words_%=\n\t" + "bcs L_sp_1024_cmp_32_words\n\t" "EOR r2, r2, r3\n\t" #else "LDR r4, [%[a], #124]\n\t" @@ -69490,7 +69702,7 @@ static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r4, [%[a]]\n\t" "LDR r5, [%[a], #4]\n\t" "\n" - "L_sp_1024_mont_reduce_32_word_%=:\n\t" + "L_sp_1024_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "MUL r10, %[mp], r4\n\t" /* a[i+0] += m[0] * mu */ @@ -69753,9 +69965,9 @@ static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r11, #0x80\n\t" #ifdef __GNUC__ - "BLT L_sp_1024_mont_reduce_32_word_%=\n\t" + "BLT L_sp_1024_mont_reduce_32_word\n\t" #else - "BLT.W L_sp_1024_mont_reduce_32_word_%=\n\t" + "BLT.W L_sp_1024_mont_reduce_32_word\n\t" #endif /* Loop Done */ "STR r4, [%[a]]\n\t" @@ -69802,7 +70014,7 @@ static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "LDR r9, [%[a], #12]\n\t" "LDR r10, [%[a], #16]\n\t" "\n" - "L_sp_1024_mont_reduce_32_word_%=:\n\t" + "L_sp_1024_mont_reduce_32_word:\n\t" /* mu = a[i] * mp */ "MUL lr, %[mp], r6\n\t" /* a[i+0] += m[0] * mu */ @@ -69970,9 +70182,9 @@ static void sp_1024_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp) "ADD %[a], %[a], #0x4\n\t" "CMP r4, #0x80\n\t" #ifdef __GNUC__ - "BLT L_sp_1024_mont_reduce_32_word_%=\n\t" + "BLT L_sp_1024_mont_reduce_32_word\n\t" #else - "BLT.W L_sp_1024_mont_reduce_32_word_%=\n\t" + "BLT.W L_sp_1024_mont_reduce_32_word\n\t" #endif /* Loop Done */ "STR r6, [%[a]]\n\t" @@ -70987,7 +71199,7 @@ static sp_digit sp_1024_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "MOV r8, #0x0\n\t" "MOV r4, #0x0\n\t" "\n" - "L_sp_1024_cond_add_32_words_%=:\n\t" + "L_sp_1024_cond_add_32_words:\n\t" "ADDS r5, r5, #0xffffffff\n\t" "LDR r6, [%[a], r4]\n\t" "LDR r7, [%[b], r4]\n\t" @@ -70998,9 +71210,9 @@ static sp_digit sp_1024_cond_add_32(sp_digit* r, const sp_digit* a, const sp_dig "ADD r4, r4, #0x4\n\t" "CMP r4, #0x80\n\t" #if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) - "BLT 
L_sp_1024_cond_add_32_words_%=\n\t" + "BLT L_sp_1024_cond_add_32_words\n\t" #else - "BLT.N L_sp_1024_cond_add_32_words_%=\n\t" + "BLT.N L_sp_1024_cond_add_32_words\n\t" #endif "MOV %[r], r5\n\t" : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m) diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c index 916a32fbf..e0e1495b9 100644 --- a/wolfcrypt/src/sp_x86_64.c +++ b/wolfcrypt/src/sp_x86_64.c @@ -55,6 +55,7 @@ #ifdef __IAR_SYSTEMS_ICC__ #define __asm__ asm #define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG #endif /* __IAR_SYSTEMS_ICC__ */ #ifdef __KEIL__ #define __asm__ __asm
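/* Two notes on the hunks above. First, defining WOLFSSL_NO_VAR_ASSIGN_REG
 * for __IAR_SYSTEMS_ICC__ makes the x86_64 file fall back to the plain
 * parameter-passing variants of functions such as div_384_word_12,
 * presumably because IAR lacks GCC-style explicit register variables;
 * that rationale is a hedged inference from the #ifndef pattern, not
 * stated in the patch. Second, the reworked sp_384_sqr_12 /
 * sp_521_mul_17 / sp_1024_mul_32 loops are product-scanning (Comba)
 * routines: each outer pass folds one output column into a 96-bit
 * accumulator (r6:r7:r8), and the rewritten inner loop walks the column
 * from both ends so a single pass adds a[i]*b[j] and a[j]*b[i] together,
 * with the lone middle product handled once and the first and last
 * single-product columns peeled off outside the loop (the UMULL
 * prologue and UMLAL epilogue). A minimal C sketch of that column
 * structure, assuming 32-bit digits (illustrative code, not the
 * generated implementation):
 */
typedef unsigned int digit_t;

static void mul_comba_sketch(digit_t* r, const digit_t* a,
                             const digit_t* b, int n)
{
    unsigned long long acc = 0; /* low 64 bits of the column sum */
    digit_t over = 0;           /* third accumulator word (r8) */
    unsigned long long t;
    int k, i, j;

    for (k = 0; k < 2 * n - 1; k++) {
        /* clamp the low index, as SUBS / IT cc / MOVCC do above */
        i = (k < n) ? 0 : k - n + 1;
        j = k - i;
        for (; i < j; i++, j--) { /* two paired products per pass */
            t = (unsigned long long)a[i] * b[j];
            acc += t; over += (digit_t)(acc < t);
            t = (unsigned long long)a[j] * b[i];
            acc += t; over += (digit_t)(acc < t);
        }
        if (i == j) { /* middle product; this is a[i]^2 when squaring */
            t = (unsigned long long)a[i] * b[i];
            acc += t; over += (digit_t)(acc < t);
        }
        r[k] = (digit_t)acc;       /* emit one output word per column */
        acc = (acc >> 32) | ((unsigned long long)over << 32);
        over = 0;
    }
    r[2 * n - 1] = (digit_t)acc;
}
/* Squaring uses the same shape with b == a, which is why the old
 * per-iteration CMP r4, r3 / BEQ ..._op_sqr branch could be dropped:
 * the i < j versus i == j split now falls out of the loop structure
 * itself, and each symmetric pair is simply accumulated twice. */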