From f8bb889712c3ce1d7170454d98db2c6f3f9a9a62 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Thu, 15 May 2025 08:58:40 +1000 Subject: [PATCH] Armv8 (Aarch64) ASM fixes for Green Hills compiler Change branch instructions to proper form. Use constant value rather than POLY1305_BLOCK_SIZE. Remove duplicate clobber registers - both w and x versions. Make clamp unconditionally compiled. --- wolfcrypt/src/port/arm/armv8-aes.c | 82 ++++++++++----------- wolfcrypt/src/port/arm/armv8-curve25519_c.c | 46 ++++++------ wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c | 40 +++++----- wolfcrypt/src/port/arm/armv8-poly1305.c | 37 ++++------ wolfcrypt/src/port/arm/armv8-sha3-asm_c.c | 4 +- wolfcrypt/src/port/arm/armv8-sha512-asm_c.c | 6 +- wolfcrypt/src/sp_arm64.c | 24 +++--- 7 files changed, 117 insertions(+), 122 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index 13fdaae2b..88b9d3794 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -978,9 +978,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in, /* double block */ "1: \n" "CMP w11, #1 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "CMP w11, #0 \n" - "BEQ 3f \n" + "B.EQ 3f \n" "MOV v0.16b, v13.16b \n" "AESE v0.16b, v1.16b \n" @@ -1129,9 +1129,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in, /* double block */ "1: \n" "CMP w11, #1 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "CMP w11, #0 \n" - "BEQ 3f \n" + "B.EQ 3f \n" "MOV v0.16b, v15.16b \n" "AESE v0.16b, v1.16b \n" @@ -1295,9 +1295,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in, /* double block */ "1: \n" "CMP w11, #1 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "CMP w11, #0 \n" - "BEQ 3f \n" + "B.EQ 3f \n" "MOV v0.16b, v17.16b \n" "AESE v0.16b, v1.16b \n" @@ -22274,7 +22274,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, "MOV r12, %[R] \n" "CMP r12, #10 \n" - "BEQ 1f \n" + "B.EQ 1f \n" "VLD1.32 {q1}, [%[Key]]! \n" "AESMC.8 q0, q0\n" "VLD1.32 {q2}, [%[Key]]! \n" @@ -22283,7 +22283,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, "AESE.8 q0, q2\n" "CMP r12, #12 \n" - "BEQ 1f \n" + "B.EQ 1f \n" "VLD1.32 {q1}, [%[Key]]! \n" "AESMC.8 q0, q0\n" "VLD1.32 {q2}, [%[Key]]! \n" @@ -22350,7 +22350,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, "MOV r12, %[R] \n" "CMP r12, #10 \n" - "BEQ 1f \n" + "B.EQ 1f \n" "VLD1.32 {q1}, [%[Key]]! \n" "AESIMC.8 q0, q0\n" "VLD1.32 {q2}, [%[Key]]! \n" @@ -22359,7 +22359,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, "AESD.8 q0, q2\n" "CMP r12, #12 \n" - "BEQ 1f \n" + "B.EQ 1f \n" "VLD1.32 {q1}, [%[Key]]! \n" "AESIMC.8 q0, q0\n" "VLD1.32 {q2}, [%[Key]]! \n" @@ -22462,7 +22462,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, "VST1.32 {q0}, [%[out]]! \n" "CMP r11, #0 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "VLD1.32 {q12}, [%[input]]! \n" "B 1b \n" @@ -22529,7 +22529,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, "VST1.32 {q0}, [%[out]]! \n" "CMP r11, #0 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "VLD1.32 {q12}, [%[input]]! \n" "B 1b \n" @@ -22603,7 +22603,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, "SUB %[Key], %[Key], #16 \n" "CMP r11, #0 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "VLD1.32 {q12}, [%[input]]! \n" "B 1b \n" @@ -22701,7 +22701,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, "VMOV.32 q13, q12 \n" "CMP r11, #0 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "VLD1.32 {q0}, [%[input]]! \n" "B 1b \n" @@ -22770,7 +22770,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, "VMOV.32 q14, q15 \n" "CMP r11, #0 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "VLD1.32 {q0}, [%[input]]! \n" "B 1b \n" @@ -22846,7 +22846,7 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, "VMOV.32 q14, q15 \n" "CMP r11, #0 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "VLD1.32 {q0}, [%[input]]! \n" "B 1b \n" @@ -22902,9 +22902,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in, /* double block */ "1: \n" "CMP r11, #1 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "CMP r11, #0 \n" - "BEQ 3f \n" + "B.EQ 3f \n" "VMOV.32 q0, q13 \n" "AESE.8 q0, q1\n" @@ -23066,9 +23066,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in, /* double block */ "1: \n" "CMP r11, #1 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "CMP r11, #0 \n" - "BEQ 3f \n" + "B.EQ 3f \n" "VMOV.32 q0, q13\n" "AESE.8 q0, q1\n" @@ -23252,9 +23252,9 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in, /* double block */ "1: \n" "CMP r11, #1 \n" - "BEQ 2f \n" + "B.EQ 2f \n" "CMP r11, #0 \n" - "BEQ 3f \n" + "B.EQ 3f \n" "VMOV.32 q0, q13 \n" "AESE.8 q0, q1\n" @@ -25017,11 +25017,11 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "# Put last 2 blocks of keys based on rounds into v14, v15\n" "SUBS WZR, %w[rounds], #14 \n" - "BEQ 40f \n" + "B.EQ 40f \n" "SUBS WZR, %w[rounds], #12 \n" "MOV v14.16b, v12.16b \n" "MOV v15.16b, v13.16b \n" - "BEQ 40f \n" + "B.EQ 40f \n" "MOV v14.16b, v10.16b \n" "MOV v15.16b, v11.16b \n" "40: \n" @@ -25041,17 +25041,17 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "# Put last 2 blocks of keys based on rounds into v14, v15\n" "SUBS WZR, %w[rounds], #14 \n" - "BEQ 41f \n" + "B.EQ 41f \n" "SUBS WZR, %w[rounds], #10 \n" "MOV v14.16b, v10.16b \n" "MOV v15.16b, v11.16b \n" - "BEQ 41f \n" + "B.EQ 41f \n" "MOV v14.16b, v12.16b \n" "MOV v15.16b, v13.16b \n" "41: \n" "SUBS WZR, %w[blocks], #4 \n" - "BLT 1f \n" + "B.LT 1f \n" "AND %w[sz], %w[sz], 0x3f \n" @@ -25174,7 +25174,7 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "AESMC v19.16b, v19.16b \n" "SUBS WZR, %w[rounds], #10 \n" - "BEQ 21f \n" + "B.EQ 21f \n" "AESE v16.16b, v10.16b \n" "AESMC v16.16b, v16.16b \n" "AESE v17.16b, v10.16b \n" @@ -25193,7 +25193,7 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "AESMC v19.16b, v19.16b \n" "SUBS WZR, %w[rounds], #12 \n" - "BEQ 21f \n" + "B.EQ 21f \n" "AESE v16.16b, v12.16b \n" "AESMC v16.16b, v16.16b \n" "AESE v17.16b, v12.16b \n" @@ -25231,7 +25231,7 @@ void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "ST1 {v16.16b-v19.16b}, [%[out]], #64 \n" "SUBS %w[blocks], %w[blocks], #4 \n" - "BGE 20b \n" + "B.GE 20b \n" "ADD %w[blocks], %w[blocks], #4 \n" "CBZ %w[sz], 3f \n" @@ -25340,11 +25340,11 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "LD1 {v13.2d-v15.2d}, [x10] \n" "SUBS WZR, %w[rounds], #14 \n" - "BEQ 40f \n" + "B.EQ 40f \n" "SUBS WZR, %w[rounds], #12 \n" "MOV v14.16b, v12.16b \n" "MOV v15.16b, v13.16b \n" - "BEQ 40f \n" + "B.EQ 40f \n" "MOV v14.16b, v10.16b \n" "MOV v15.16b, v11.16b \n" "40: \n" @@ -25362,11 +25362,11 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "LD1 {v13.2d-v15.2d}, [x11] \n" "SUBS WZR, %w[rounds], #14 \n" - "BEQ 41f \n" + "B.EQ 41f \n" "SUBS WZR, %w[rounds], #12 \n" "MOV v14.16b, v12.16b \n" "MOV v15.16b, v13.16b \n" - "BEQ 41f \n" + "B.EQ 41f \n" "MOV v14.16b, v10.16b \n" "MOV v15.16b, v11.16b \n" "41: \n" @@ -25374,7 +25374,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "CBZ %w[blocks], 3f \n" "SUBS WZR, %w[blocks], #4 \n" - "BLT 1f \n" + "B.LT 1f \n" "AND x17, x19, x10, ASR #63\n" "EXTR x12, x10, x9, #63 \n" @@ -25495,7 +25495,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "AESIMC v19.16b, v19.16b \n" "SUBS WZR, %w[rounds], #10 \n" - "BEQ 21f \n" + "B.EQ 21f \n" "AESD v16.16b, v10.16b \n" "AESIMC v16.16b, v16.16b \n" "AESD v17.16b, v10.16b \n" @@ -25514,7 +25514,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "AESIMC v19.16b, v19.16b \n" "SUBS WZR, %w[rounds], #12 \n" - "BEQ 21f \n" + "B.EQ 21f \n" "AESD v16.16b, v12.16b \n" "AESIMC v16.16b, v16.16b \n" "AESD v17.16b, v12.16b \n" @@ -25553,7 +25553,7 @@ void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, "SUBS %w[blocks], %w[blocks], #4 \n" "SUB %w[sz], %w[sz], #64 \n" - "BGE 20b \n" + "B.GE 20b \n" "ADD %w[blocks], %w[blocks], #4 \n" "CBZ %w[sz], 4f \n" @@ -25914,7 +25914,7 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, "BGT 1b \n" "CMP %[sz], #0 \n" - "BEQ 3f \n" + "B.EQ 3f \n" "30: \n" "#Partial block \n" @@ -26026,7 +26026,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, "VLD1.32 {d18, d19}, [%[key2]]! \n" "CMP %[blocks], #0 \n" - "BEQ 3f \n" + "B.EQ 3f \n" "1: \n" "VLD1.32 {q0}, [%[in]]! \n" @@ -26050,7 +26050,7 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, "BGT 1b \n" "CMP %[sz], #0 \n" - "BEQ 4f \n" + "B.EQ 4f \n" "3: \n" diff --git a/wolfcrypt/src/port/arm/armv8-curve25519_c.c b/wolfcrypt/src/port/arm/armv8-curve25519_c.c index d3d119fed..852671189 100644 --- a/wolfcrypt/src/port/arm/armv8-curve25519_c.c +++ b/wolfcrypt/src/port/arm/armv8-curve25519_c.c @@ -868,7 +868,7 @@ void fe_invert(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bne L_fe_invert1_%=\n\t" + "b.ne L_fe_invert1_%=\n\t" /* Store */ "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" @@ -969,7 +969,7 @@ void fe_invert(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bne L_fe_invert2_%=\n\t" + "b.ne L_fe_invert2_%=\n\t" /* Store */ "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" @@ -1070,7 +1070,7 @@ void fe_invert(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bne L_fe_invert3_%=\n\t" + "b.ne L_fe_invert3_%=\n\t" /* Store */ "stp x6, x7, [x29, #112]\n\t" "stp x8, x9, [x29, #128]\n\t" @@ -1171,7 +1171,7 @@ void fe_invert(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bne L_fe_invert4_%=\n\t" + "b.ne L_fe_invert4_%=\n\t" /* Store */ "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" @@ -1270,7 +1270,7 @@ void fe_invert(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bne L_fe_invert5_%=\n\t" + "b.ne L_fe_invert5_%=\n\t" /* Store */ "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" @@ -1371,7 +1371,7 @@ void fe_invert(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bne L_fe_invert6_%=\n\t" + "b.ne L_fe_invert6_%=\n\t" /* Store */ "stp x6, x7, [x29, #112]\n\t" "stp x8, x9, [x29, #128]\n\t" @@ -1472,7 +1472,7 @@ void fe_invert(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bne L_fe_invert7_%=\n\t" + "b.ne L_fe_invert7_%=\n\t" /* Store */ "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" @@ -1571,7 +1571,7 @@ void fe_invert(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x20, x20, #1\n\t" - "bne L_fe_invert8_%=\n\t" + "b.ne L_fe_invert8_%=\n\t" /* Store */ "stp x6, x7, [x29, #48]\n\t" "stp x8, x9, [x29, #64]\n\t" @@ -2830,7 +2830,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bne L_curve25519_inv_1_%=\n\t" + "b.ne L_curve25519_inv_1_%=\n\t" /* Store */ "stp x6, x7, [x29, #112]\n\t" "stp x8, x9, [x29, #128]\n\t" @@ -2931,7 +2931,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bne L_curve25519_inv_2_%=\n\t" + "b.ne L_curve25519_inv_2_%=\n\t" /* Store */ "stp x6, x7, [x29, #112]\n\t" "stp x8, x9, [x29, #128]\n\t" @@ -3032,7 +3032,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bne L_curve25519_inv_3_%=\n\t" + "b.ne L_curve25519_inv_3_%=\n\t" /* Store */ "stp x6, x7, [x29, #144]\n\t" "stp x8, x9, [x29, #160]\n\t" @@ -3133,7 +3133,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bne L_curve25519_inv_4_%=\n\t" + "b.ne L_curve25519_inv_4_%=\n\t" /* Store */ "stp x6, x7, [x29, #112]\n\t" "stp x8, x9, [x29, #128]\n\t" @@ -3232,7 +3232,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bne L_curve25519_inv_5_%=\n\t" + "b.ne L_curve25519_inv_5_%=\n\t" /* Store */ "stp x6, x7, [x29, #112]\n\t" "stp x8, x9, [x29, #128]\n\t" @@ -3333,7 +3333,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bne L_curve25519_inv_6_%=\n\t" + "b.ne L_curve25519_inv_6_%=\n\t" /* Store */ "stp x6, x7, [x29, #144]\n\t" "stp x8, x9, [x29, #160]\n\t" @@ -3434,7 +3434,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bne L_curve25519_inv_7_%=\n\t" + "b.ne L_curve25519_inv_7_%=\n\t" /* Store */ "stp x6, x7, [x29, #112]\n\t" "stp x8, x9, [x29, #128]\n\t" @@ -3533,7 +3533,7 @@ int curve25519(byte* r, const byte* n, const byte* a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x24, x24, #1\n\t" - "bne L_curve25519_inv_8_%=\n\t" + "b.ne L_curve25519_inv_8_%=\n\t" /* Store */ "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" @@ -3854,7 +3854,7 @@ void fe_pow22523(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bne L_fe_pow22523_1_%=\n\t" + "b.ne L_fe_pow22523_1_%=\n\t" /* Store */ "stp x6, x7, [x29, #48]\n\t" "stp x8, x9, [x29, #64]\n\t" @@ -3957,7 +3957,7 @@ void fe_pow22523(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bne L_fe_pow22523_2_%=\n\t" + "b.ne L_fe_pow22523_2_%=\n\t" /* Store */ "stp x6, x7, [x29, #48]\n\t" "stp x8, x9, [x29, #64]\n\t" @@ -4058,7 +4058,7 @@ void fe_pow22523(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bne L_fe_pow22523_3_%=\n\t" + "b.ne L_fe_pow22523_3_%=\n\t" /* Store */ "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" @@ -4159,7 +4159,7 @@ void fe_pow22523(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bne L_fe_pow22523_4_%=\n\t" + "b.ne L_fe_pow22523_4_%=\n\t" /* Store */ "stp x6, x7, [x29, #48]\n\t" "stp x8, x9, [x29, #64]\n\t" @@ -4258,7 +4258,7 @@ void fe_pow22523(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bne L_fe_pow22523_5_%=\n\t" + "b.ne L_fe_pow22523_5_%=\n\t" /* Store */ "stp x6, x7, [x29, #48]\n\t" "stp x8, x9, [x29, #64]\n\t" @@ -4359,7 +4359,7 @@ void fe_pow22523(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bne L_fe_pow22523_6_%=\n\t" + "b.ne L_fe_pow22523_6_%=\n\t" /* Store */ "stp x6, x7, [x29, #80]\n\t" "stp x8, x9, [x29, #96]\n\t" @@ -4460,7 +4460,7 @@ void fe_pow22523(fe r, const fe a) "adcs x8, x12, x15\n\t" "adc x9, x13, x16\n\t" "subs x23, x23, #1\n\t" - "bne L_fe_pow22523_7_%=\n\t" + "b.ne L_fe_pow22523_7_%=\n\t" /* Store */ "stp x6, x7, [x29, #48]\n\t" "stp x8, x9, [x29, #64]\n\t" diff --git a/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c b/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c index 7b8cf20e0..33a707c01 100644 --- a/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c @@ -8553,7 +8553,7 @@ int mlkem_cmp_neon(const byte* a, const byte* b, int sz) "orr v10.16b, v10.16b, v2.16b\n\t" "orr v11.16b, v11.16b, v3.16b\n\t" "subs %w[sz], %w[sz], #0x300\n\t" - "beq L_mlkem_aarch64_cmp_neon_done_%=\n\t" + "b.eq L_mlkem_aarch64_cmp_neon_done_%=\n\t" "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" "eor v0.16b, v0.16b, v4.16b\n\t" @@ -8605,7 +8605,7 @@ int mlkem_cmp_neon(const byte* a, const byte* b, int sz) "orr v10.16b, v10.16b, v2.16b\n\t" "orr v11.16b, v11.16b, v3.16b\n\t" "subs %w[sz], %w[sz], #0x140\n\t" - "beq L_mlkem_aarch64_cmp_neon_done_%=\n\t" + "b.eq L_mlkem_aarch64_cmp_neon_done_%=\n\t" "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[a]], #0x40\n\t" "ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[b]], #0x40\n\t" "eor v0.16b, v0.16b, v4.16b\n\t" @@ -9278,9 +9278,9 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r, "ldr q3, [x5]\n\t" "ldr q2, [x6]\n\t" "subs wzr, %w[len], #0\n\t" - "beq L_mlkem_rej_uniform_done_%=\n\t" + "b.eq L_mlkem_rej_uniform_done_%=\n\t" "subs wzr, %w[len], #16\n\t" - "blt L_mlkem_rej_uniform_loop_4_%=\n\t" + "b.lt L_mlkem_rej_uniform_loop_4_%=\n\t" "\n" "L_mlkem_rej_uniform_loop_16_%=: \n\t" "ld3 {v4.8b, v5.8b, v6.8b}, [%x[r]], #24\n\t" @@ -9323,17 +9323,17 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r, "add %x[p], %x[p], x11, lsl 1\n\t" "add x12, x12, x11\n\t" "subs %w[rLen], %w[rLen], #24\n\t" - "beq L_mlkem_rej_uniform_done_%=\n\t" + "b.eq L_mlkem_rej_uniform_done_%=\n\t" "sub w10, %w[len], w12\n\t" "subs x10, x10, #16\n\t" - "blt L_mlkem_rej_uniform_loop_4_%=\n\t" + "b.lt L_mlkem_rej_uniform_loop_4_%=\n\t" "b L_mlkem_rej_uniform_loop_16_%=\n\t" "\n" "L_mlkem_rej_uniform_loop_4_%=: \n\t" "subs w10, %w[len], w12\n\t" - "beq L_mlkem_rej_uniform_done_%=\n\t" + "b.eq L_mlkem_rej_uniform_done_%=\n\t" "subs x10, x10, #4\n\t" - "blt L_mlkem_rej_uniform_loop_lt_4_%=\n\t" + "b.lt L_mlkem_rej_uniform_loop_lt_4_%=\n\t" "ldr x4, [%x[r]], #6\n\t" "lsr x5, x4, #12\n\t" "lsr x6, x4, #24\n\t" @@ -9363,7 +9363,7 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r, "cinc %x[p], %x[p], lt\n\t" "cinc x12, x12, lt\n\t" "subs %w[rLen], %w[rLen], #6\n\t" - "beq L_mlkem_rej_uniform_done_%=\n\t" + "b.eq L_mlkem_rej_uniform_done_%=\n\t" "b L_mlkem_rej_uniform_loop_4_%=\n\t" "\n" "L_mlkem_rej_uniform_loop_lt_4_%=: \n\t" @@ -9381,30 +9381,30 @@ unsigned int mlkem_rej_uniform_neon(sword16* p, unsigned int len, const byte* r, "cinc %x[p], %x[p], lt\n\t" "cinc x12, x12, lt\n\t" "subs wzr, %w[len], w12\n\t" - "beq L_mlkem_rej_uniform_done_%=\n\t" + "b.eq L_mlkem_rej_uniform_done_%=\n\t" "strh w5, [%x[p]]\n\t" "subs xzr, x5, x13\n\t" "cinc %x[p], %x[p], lt\n\t" "cinc %x[p], %x[p], lt\n\t" "cinc x12, x12, lt\n\t" "subs wzr, %w[len], w12\n\t" - "beq L_mlkem_rej_uniform_done_%=\n\t" + "b.eq L_mlkem_rej_uniform_done_%=\n\t" "strh w6, [%x[p]]\n\t" "subs xzr, x6, x13\n\t" "cinc %x[p], %x[p], lt\n\t" "cinc %x[p], %x[p], lt\n\t" "cinc x12, x12, lt\n\t" "subs wzr, %w[len], w12\n\t" - "beq L_mlkem_rej_uniform_done_%=\n\t" + "b.eq L_mlkem_rej_uniform_done_%=\n\t" "strh w7, [%x[p]]\n\t" "subs xzr, x7, x13\n\t" "cinc %x[p], %x[p], lt\n\t" "cinc %x[p], %x[p], lt\n\t" "cinc x12, x12, lt\n\t" "subs wzr, %w[len], w12\n\t" - "beq L_mlkem_rej_uniform_done_%=\n\t" + "b.eq L_mlkem_rej_uniform_done_%=\n\t" "subs %w[rLen], %w[rLen], #6\n\t" - "beq L_mlkem_rej_uniform_done_%=\n\t" + "b.eq L_mlkem_rej_uniform_done_%=\n\t" "b L_mlkem_rej_uniform_loop_lt_4_%=\n\t" "\n" "L_mlkem_rej_uniform_done_%=: \n\t" @@ -9695,7 +9695,7 @@ void mlkem_sha3_blocksx3_neon(word64* state) "mov v30.d[1], %x[state]\n\t" "eor x1, x1, %x[state]\n\t" "eor v0.16b, v0.16b, v30.16b\n\t" - "bne L_SHA3_transform_blocksx3_neon_begin_%=\n\t" + "b.ne L_SHA3_transform_blocksx3_neon_begin_%=\n\t" "ldr %x[state], [x29, #40]\n\t" "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" @@ -10037,7 +10037,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) "mov v30.d[1], %x[state]\n\t" "eor x2, x2, %x[state]\n\t" "eor v0.16b, v0.16b, v30.16b\n\t" - "bne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t" + "b.ne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t" "ldr %x[state], [x29, #40]\n\t" "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" @@ -10379,7 +10379,7 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) "mov v30.d[1], %x[state]\n\t" "eor x2, x2, %x[state]\n\t" "eor v0.16b, v0.16b, v30.16b\n\t" - "bne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t" + "b.ne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t" "ldr %x[state], [x29, #40]\n\t" "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" @@ -10785,7 +10785,7 @@ void mlkem_sha3_blocksx3_neon(word64* state) "mov v30.d[1], %x[state]\n\t" "eor x1, x1, %x[state]\n\t" "eor v0.16b, v0.16b, v30.16b\n\t" - "bne L_SHA3_transform_blocksx3_neon_begin_%=\n\t" + "b.ne L_SHA3_transform_blocksx3_neon_begin_%=\n\t" "ldr %x[state], [x29, #40]\n\t" "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" @@ -11212,7 +11212,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) "mov v30.d[1], %x[state]\n\t" "eor x2, x2, %x[state]\n\t" "eor v0.16b, v0.16b, v30.16b\n\t" - "bne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t" + "b.ne L_SHA3_shake128_blocksx3_seed_neon_begin_%=\n\t" "ldr %x[state], [x29, #40]\n\t" "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" @@ -11639,7 +11639,7 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) "mov v30.d[1], %x[state]\n\t" "eor x2, x2, %x[state]\n\t" "eor v0.16b, v0.16b, v30.16b\n\t" - "bne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t" + "b.ne L_SHA3_shake256_blocksx3_seed_neon_begin_%=\n\t" "ldr %x[state], [x29, #40]\n\t" "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" diff --git a/wolfcrypt/src/port/arm/armv8-poly1305.c b/wolfcrypt/src/port/arm/armv8-poly1305.c index f64830599..f3c447dcd 100644 --- a/wolfcrypt/src/port/arm/armv8-poly1305.c +++ b/wolfcrypt/src/port/arm/armv8-poly1305.c @@ -48,7 +48,7 @@ static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx, { __asm__ __volatile__ ( /* Check for zero bytes to do. */ - "CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t" + "CMP %[bytes], #16 \n\t" "BLO L_poly1305_aarch64_16_done_%= \n\t" "MOV x12, #1 \n\t" @@ -127,8 +127,8 @@ static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx, "ADCS x5, x5, x15\n\t" "ADC x6, x6, xzr\n\t" - "SUBS %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]\n\t" - "ADD %[m], %[m], %[POLY1305_BLOCK_SIZE]\n\t" + "SUBS %[bytes], %[bytes], #16\n\t" + "ADD %[m], %[m], #16\n\t" "BGT L_poly1305_aarch64_16_loop_%=\n\t" /* Base 64 -> Base 26 */ @@ -160,7 +160,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, { __asm__ __volatile__ ( /* If less than 4 blocks to process then use regular method */ - "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t" + "CMP %[bytes], #64 \n\t" "BLO L_poly1305_aarch64_64_done_%= \n\t" "MOV x9, #0x3ffffff \n\t" /* Load h */ @@ -188,7 +188,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, "MOV v26.D[0], x9 \n\t" "MOV v26.D[1], x9 \n\t" "DUP v30.4S, v26.S[0] \n\t" - "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t" + "CMP %[bytes], #96 \n\t" "BLO L_poly1305_aarch64_64_start_block_size_64_%= \n\t" /* Load r^2 to NEON v0, v1, v2, v3, v4 */ "LD4 { v0.S-v3.S }[2], [%[r_2]], #16 \n\t" @@ -229,7 +229,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, /* Load m */ /* Load four message blocks to NEON v10, v11, v12, v13, v14 */ "LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t" - "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t" + "SUB %[bytes], %[bytes], #64 \n\t" "USHR v14.4S, v13.4S, #8 \n\t" "ORR v14.16B, v14.16B, v30.16B \n\t" "SHL v13.4S, v13.4S, #18 \n\t" @@ -362,12 +362,12 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, "UMLAL2 v24.2D, v14.4S, v9.4S \n\t" "UMLAL2 v25.2D, v14.4S, v0.4S \n\t" /* If less than six message blocks left then leave loop */ - "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t" + "CMP %[bytes], #96 \n\t" "BLS L_poly1305_aarch64_64_loop_128_final_%= \n\t" /* Load m */ /* Load four message blocks to NEON v10, v11, v12, v13, v14 */ "LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t" - "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t" + "SUB %[bytes], %[bytes], #64 \n\t" "USHR v14.4S, v13.4S, #8 \n\t" "ORR v14.16B, v14.16B, v30.16B \n\t" "SHL v13.4S, v13.4S, #18 \n\t" @@ -424,7 +424,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t" /* Copy r^2 to lower half of registers */ "MOV v0.D[0], v0.D[1] \n\t" - "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t" + "SUB %[bytes], %[bytes], #32 \n\t" "MOV v5.D[0], v5.D[1] \n\t" "USHR v14.2D, v11.2D, #40 \n\t" "MOV v1.D[0], v1.D[1] \n\t" @@ -492,7 +492,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, "MOV v18.S[1], v18.S[2] \n\t" "MOV v19.S[1], v19.S[2] \n\t" /* If less than 2 blocks left go straight to final multiplication. */ - "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t" + "CMP %[bytes], #32 \n\t" "BLO L_poly1305_aarch64_64_last_mult_%= \n\t" /* Else go to one loop of L_poly1305_aarch64_64_loop_64 */ "B L_poly1305_aarch64_64_loop_64_%= \n\t" @@ -524,7 +524,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, /* Load m */ /* Load two message blocks to NEON v10, v11, v12, v13, v14 */ "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t" - "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t" + "SUB %[bytes], %[bytes], #32 \n\t" "USHR v14.2D, v11.2D, #40 \n\t" "ORR v14.16B, v14.16B, v26.16B \n\t" "USHR v13.2D, v11.2D, #14 \n\t" @@ -616,7 +616,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, "LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t" /* Reduce h % P */ "MOV x14, #5 \n\t" - "SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t" + "SUB %[bytes], %[bytes], #32 \n\t" "ADD x10, x10, x9, LSR #26 \n\t" "USHR v14.2D, v11.2D, #40 \n\t" "ADD x13, x13, x12, LSR #26 \n\t" @@ -676,7 +676,7 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, "MOV v18.S[1], v18.S[2] \n\t" "MOV v19.S[1], v19.S[2] \n\t" /* If at least two message blocks left then loop_64 */ - "CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t" + "CMP %[bytes], #32 \n\t" "BHS L_poly1305_aarch64_64_loop_64_%= \n\t" "\n" ".align 2 \n\t" @@ -831,11 +831,9 @@ void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17", - "w19", "w20", "w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28", - "w30", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", - "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", - "x28", "x30" + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", + "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", + "x30" ); poly1305_blocks_aarch64_16(ctx, m, bytes); } @@ -845,12 +843,10 @@ void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m) poly1305_blocks_aarch64_16(ctx, m, POLY1305_BLOCK_SIZE); } -#if defined(POLY130564) static word64 clamp[] = { 0x0ffffffc0fffffff, 0x0ffffffc0ffffffc, }; -#endif /* POLY130564 */ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) @@ -1112,7 +1108,6 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) [ctx_r64] "r" (ctx->r64), [ctx_r] "r" (ctx->r), [ctx_r_2] "r" (ctx->r_2), [ctx_r_4] "r" (ctx->r_4) : "memory", "cc", - "w4", "w5", "w6", "w7", "w8", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10" ); diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c index 5dd5196e1..8603b6e57 100644 --- a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c @@ -153,7 +153,7 @@ void BlockSha3_crypto(word64* state) "ld1r {v30.2d}, [x1], #8\n\t" "subs x2, x2, #1\n\t" "eor v0.16b, v0.16b, v30.16b\n\t" - "bne L_sha3_crypto_begin_%=\n\t" + "b.ne L_sha3_crypto_begin_%=\n\t" "st4 {v0.d, v1.d, v2.d, v3.d}[0], [%x[state]], #32\n\t" "st4 {v4.d, v5.d, v6.d, v7.d}[0], [%x[state]], #32\n\t" "st4 {v8.d, v9.d, v10.d, v11.d}[0], [%x[state]], #32\n\t" @@ -352,7 +352,7 @@ void BlockSha3_base(word64* state) "ldr %x[state], [x27], #8\n\t" "subs x28, x28, #1\n\t" "eor x1, x1, %x[state]\n\t" - "bne L_SHA3_transform_base_begin_%=\n\t" + "b.ne L_SHA3_transform_base_begin_%=\n\t" "ldr %x[state], [x29, #40]\n\t" "stp x1, x2, [%x[state]]\n\t" "stp x3, x4, [%x[state], #16]\n\t" diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c index 0697763e6..6a5cebb3c 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c @@ -629,7 +629,7 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len) "add x8, x8, x4\n\t" "add x4, x4, x14\n\t" "subs x27, x27, #1\n\t" - "bne L_sha512_len_neon_start_%=\n\t" + "b.ne L_sha512_len_neon_start_%=\n\t" /* Round 0 */ "mov x13, v0.d[0]\n\t" "ldr x15, [x3], #8\n\t" @@ -998,7 +998,7 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len) "add x3, x3, %[L_SHA512_transform_neon_len_k]@PAGEOFF\n\t" #endif /* __APPLE__ */ "subs %w[len], %w[len], #0x80\n\t" - "bne L_sha512_len_neon_begin_%=\n\t" + "b.ne L_sha512_len_neon_begin_%=\n\t" "stp x4, x5, [%x[sha512]]\n\t" "stp x6, x7, [%x[sha512], #16]\n\t" "stp x8, x9, [%x[sha512], #32]\n\t" @@ -1576,7 +1576,7 @@ void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data, "add v25.2d, v25.2d, v29.2d\n\t" "add v24.2d, v24.2d, v28.2d\n\t" "subs %w[len], %w[len], #0x80\n\t" - "bne L_sha512_len_crypto_begin_%=\n\t" + "b.ne L_sha512_len_crypto_begin_%=\n\t" /* Store digest back */ "st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [%x[sha512]]\n\t" : [sha512] "+r" (sha512), [data] "+r" (data), [len] "+r" (len) diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c index dc9f2960d..10afa099c 100644 --- a/wolfcrypt/src/sp_arm64.c +++ b/wolfcrypt/src/sp_arm64.c @@ -93,7 +93,7 @@ static void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n) "sub x4, x4, 8\n\t" "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" - "blt 2f\n\t" + "b.lt 2f\n\t" /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" @@ -3351,7 +3351,7 @@ SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" - "bne 1b\n\t" + "b.ne 1b\n\t" "# Create mask\n\t" "neg x3, x3\n\t" "mov x9, %[a]\n\t" @@ -6980,7 +6980,7 @@ static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n) "sub x4, x4, 8\n\t" "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" - "blt 2f\n\t" + "b.lt 2f\n\t" /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" @@ -16577,7 +16577,7 @@ static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n) "sub x4, x4, 8\n\t" "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" - "blt 2f\n\t" + "b.lt 2f\n\t" /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" @@ -39659,7 +39659,7 @@ static void sp_256_from_bin(sp_digit* r, int size, const byte* a, int n) "sub x4, x4, 8\n\t" "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" - "blt 2f\n\t" + "b.lt 2f\n\t" /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" @@ -43865,7 +43865,7 @@ SP_NOINLINE static void sp_384_mont_reduce_order_6(sp_digit* a, const sp_digit* "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" - "bne 1b\n\t" + "b.ne 1b\n\t" "# Create mask\n\t" "neg x3, x3\n\t" "mov x9, %[a]\n\t" @@ -66408,7 +66408,7 @@ static void sp_384_from_bin(sp_digit* r, int size, const byte* a, int n) "sub x4, x4, 8\n\t" "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" - "blt 2f\n\t" + "b.lt 2f\n\t" /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" @@ -72238,7 +72238,7 @@ SP_NOINLINE static void sp_521_mont_reduce_9(sp_digit* a, const sp_digit* m, "# mu = a[i] * mp\n\t" "mul x9, %[mp], x13\n\t" "cmp x4, #1\n\t" - "bne L_521_mont_reduce_9_nomask\n\t" + "b.ne L_521_mont_reduce_9_nomask\n\t" "and x9, x9, #0x1ff\n\t" "L_521_mont_reduce_9_nomask:\n\t" "# a[i+0] += m[0] * mu\n\t" @@ -72312,7 +72312,7 @@ SP_NOINLINE static void sp_521_mont_reduce_9(sp_digit* a, const sp_digit* m, "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" - "bne 1b\n\t" + "b.ne 1b\n\t" "extr x12, x13, x12, 9\n\t" "extr x13, x14, x13, 9\n\t" "extr x14, x15, x14, 9\n\t" @@ -111555,7 +111555,7 @@ static void sp_521_from_bin(sp_digit* r, int size, const byte* a, int n) "sub x4, x4, 8\n\t" "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" - "blt 2f\n\t" + "b.lt 2f\n\t" /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t" @@ -115993,7 +115993,7 @@ SP_NOINLINE static void sp_1024_mont_reduce_16(sp_digit* a, const sp_digit* m, "adc x3, x3, xzr\n\t" "subs x4, x4, 1\n\t" "add %[a], %[a], 8\n\t" - "bne 1b\n\t" + "b.ne 1b\n\t" "# Create mask\n\t" "subs x11, x10, x28\n\t" "neg x3, x3\n\t" @@ -125143,7 +125143,7 @@ static void sp_1024_from_bin(sp_digit* r, int size, const byte* a, int n) "sub x4, x4, 8\n\t" "subs x6, %[n], 8\n\t" "mov x7, xzr\n\t" - "blt 2f\n\t" + "b.lt 2f\n\t" /* Put in multiples of 8 bytes. */ "1:\n\t" "ldr x8, [x4], -8\n\t"