From 070923a3737b0554c087048390d577cd9527636e Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Tue, 28 Oct 2025 10:05:43 +1000 Subject: [PATCH 1/2] AES ARM ASM: user data loaded 1 reg at a time User key may not be aligned and need to use instructions that don't require alignment. Change to use ldr instead of ldp or ldrd. --- wolfcrypt/src/port/arm/armv8-32-aes-asm.S | 45 +++------------------ wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c | 45 +++------------------ wolfcrypt/src/port/arm/armv8-aes-asm.S | 14 ++++--- wolfcrypt/src/port/arm/armv8-aes-asm_c.c | 14 ++++--- 4 files changed, 30 insertions(+), 88 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S index b79f970c6..4cd585287 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm.S @@ -8596,17 +8596,10 @@ AES_set_encrypt_key: beq L_AES_set_encrypt_key_start_128 cmp r1, #0xc0 beq L_AES_set_encrypt_key_start_192 -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) - ldm r0, {r4, r5} -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0] + ldr r5, [r0, #4] ldr r6, [r0, #8] ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) # REV r4, r4 eor r3, r4, r4, ror #16 @@ -8635,18 +8628,10 @@ AES_set_encrypt_key: rev r7, r7 #endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ stm r2!, {r4, r5, r6, r7} -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r4, [r0, #16] ldr r5, [r0, #20] -#else - ldrd r4, r5, [r0, #16] -#endif -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r6, [r0, #24] ldr r7, [r0, #28] -#else - ldrd r6, r7, [r0, #24] -#endif #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) # REV r4, r4 eor r3, r4, r4, ror #16 @@ -8825,23 +8810,12 @@ L_AES_set_encrypt_key_loop_256: sub r2, r2, #16 b L_AES_set_encrypt_key_end L_AES_set_encrypt_key_start_192: -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) - ldm r0, {r4, r5} -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0] + ldr r5, [r0, #4] ldr r6, [r0, #8] ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) ldr r1, [r0, #20] ldr r0, [r0, #16] -#else - ldrd r0, r1, [r0, #16] -#endif #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) # REV r4, r4 eor r3, r4, r4, ror #16 @@ -8989,17 +8963,10 @@ L_AES_set_encrypt_key_loop_192: stm r2, {r0, r1, r4, r5} b L_AES_set_encrypt_key_end L_AES_set_encrypt_key_start_128: -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) - ldm r0, {r4, r5} -#else - ldrd r4, r5, [r0] -#endif -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + ldr r4, [r0] + ldr r5, [r0, #4] ldr r6, [r0, #8] ldr r7, [r0, #12] -#else - ldrd r6, r7, [r0, #8] -#endif #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) # REV r4, r4 eor r3, r4, r4, ror #16 diff --git a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c index 6aa6abb06..7ff7e3d24 100644 --- a/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-aes-asm_c.c @@ -8876,17 +8876,10 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key, "beq L_AES_set_encrypt_key_start_128_%=\n\t" "cmp %[len], #0xc0\n\t" "beq L_AES_set_encrypt_key_start_192_%=\n\t" -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) - "ldm r0, {r4, r5}\n\t" -#else - "ldrd r4, r5, [%[key]]\n\t" -#endif -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[key]]\n\t" + "ldr r5, [%[key], #4]\n\t" "ldr r6, [%[key], #8]\n\t" "ldr r7, [%[key], #12]\n\t" -#else - "ldrd r6, r7, [%[key], #8]\n\t" -#endif #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) /* REV r4, r4 */ "eor r3, r4, r4, ror #16\n\t" @@ -8915,18 +8908,10 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key, "rev r7, r7\n\t" #endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ "stm %[ks]!, {r4, r5, r6, r7}\n\t" -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r4, [%[key], #16]\n\t" "ldr r5, [%[key], #20]\n\t" -#else - "ldrd r4, r5, [%[key], #16]\n\t" -#endif -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr r6, [%[key], #24]\n\t" "ldr r7, [%[key], #28]\n\t" -#else - "ldrd r6, r7, [%[key], #24]\n\t" -#endif #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) /* REV r4, r4 */ "eor r3, r4, r4, ror #16\n\t" @@ -9107,23 +9092,12 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key, "b L_AES_set_encrypt_key_end_%=\n\t" "\n" "L_AES_set_encrypt_key_start_192_%=: \n\t" -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) - "ldm r0, {r4, r5}\n\t" -#else - "ldrd r4, r5, [%[key]]\n\t" -#endif -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[key]]\n\t" + "ldr r5, [%[key], #4]\n\t" "ldr r6, [%[key], #8]\n\t" "ldr r7, [%[key], #12]\n\t" -#else - "ldrd r6, r7, [%[key], #8]\n\t" -#endif -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) "ldr %[len], [%[key], #20]\n\t" "ldr %[key], [%[key], #16]\n\t" -#else - "ldrd %[key], %[len], [%[key], #16]\n\t" -#endif #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) /* REV r4, r4 */ "eor r3, r4, r4, ror #16\n\t" @@ -9273,17 +9247,10 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key, "b L_AES_set_encrypt_key_end_%=\n\t" "\n" "L_AES_set_encrypt_key_start_128_%=: \n\t" -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) - "ldm r0, {r4, r5}\n\t" -#else - "ldrd r4, r5, [%[key]]\n\t" -#endif -#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7) + "ldr r4, [%[key]]\n\t" + "ldr r5, [%[key], #4]\n\t" "ldr r6, [%[key], #8]\n\t" "ldr r7, [%[key], #12]\n\t" -#else - "ldrd r6, r7, [%[key], #8]\n\t" -#endif #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) /* REV r4, r4 */ "eor r3, r4, r4, ror #16\n\t" diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm.S b/wolfcrypt/src/port/arm/armv8-aes-asm.S index c3d7b0590..1ac5b953b 100644 --- a/wolfcrypt/src/port/arm/armv8-aes-asm.S +++ b/wolfcrypt/src/port/arm/armv8-aes-asm.S @@ -46,8 +46,9 @@ _AES_set_key_AARCH64: cmp x1, #24 blt L_aes_set_key_arm64_crypto_start_128 bgt L_aes_set_key_arm64_crypto_start_256 - ldp x4, x6, [x0], #16 - ldr x8, [x0] + ldr x4, [x0], #8 + ldr x6, [x0], #8 + ldr x8, [x0], #8 stp x4, x6, [x2], #16 str x8, [x2], #8 lsr x5, x4, #32 @@ -212,8 +213,10 @@ _AES_set_key_AARCH64: stur q0, [x2, #96] b L_aes_set_key_arm64_crypto_done L_aes_set_key_arm64_crypto_start_256: - ldp x4, x6, [x0], #16 - ldp x8, x10, [x0], #16 + ldr x4, [x0], #8 + ldr x6, [x0], #8 + ldr x8, [x0], #8 + ldr x10, [x0], #8 stp x4, x6, [x2], #16 stp x8, x10, [x2], #16 lsr x5, x4, #32 @@ -412,7 +415,8 @@ L_aes_set_key_arm64_crypto_start_256: stur q0, [x2, #112] b L_aes_set_key_arm64_crypto_done L_aes_set_key_arm64_crypto_start_128: - ldp x4, x6, [x0], #16 + ldr x4, [x0], #8 + ldr x6, [x0], #8 stp x4, x6, [x2], #16 lsr x5, x4, #32 lsr x7, x6, #32 diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c index d567b23e3..e76ad8e1a 100644 --- a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c @@ -40,8 +40,9 @@ void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir) "cmp %x[keylen], #24\n\t" "b.lt L_aes_set_key_arm64_crypto_start_128_%=\n\t" "b.gt L_aes_set_key_arm64_crypto_start_256_%=\n\t" - "ldp x4, x6, [%x[userKey]], #16\n\t" - "ldr x8, [%x[userKey]]\n\t" + "ldr x4, [%x[userKey]], #8\n\t" + "ldr x6, [%x[userKey]], #8\n\t" + "ldr x8, [%x[userKey]], #8\n\t" "stp x4, x6, [%x[key]], #16\n\t" "str x8, [%x[key]], #8\n\t" "lsr x5, x4, #32\n\t" @@ -207,8 +208,10 @@ void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir) "b L_aes_set_key_arm64_crypto_done_%=\n\t" "\n" "L_aes_set_key_arm64_crypto_start_256_%=: \n\t" - "ldp x4, x6, [%x[userKey]], #16\n\t" - "ldp x8, x10, [%x[userKey]], #16\n\t" + "ldr x4, [%x[userKey]], #8\n\t" + "ldr x6, [%x[userKey]], #8\n\t" + "ldr x8, [%x[userKey]], #8\n\t" + "ldr x10, [%x[userKey]], #8\n\t" "stp x4, x6, [%x[key]], #16\n\t" "stp x8, x10, [%x[key]], #16\n\t" "lsr x5, x4, #32\n\t" @@ -408,7 +411,8 @@ void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir) "b L_aes_set_key_arm64_crypto_done_%=\n\t" "\n" "L_aes_set_key_arm64_crypto_start_128_%=: \n\t" - "ldp x4, x6, [%x[userKey]], #16\n\t" + "ldr x4, [%x[userKey]], #8\n\t" + "ldr x6, [%x[userKey]], #8\n\t" "stp x4, x6, [%x[key]], #16\n\t" "lsr x5, x4, #32\n\t" "lsr x7, x6, #32\n\t" From d883a950d2ed6190b233003f3b66db34c7d24a2b Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Tue, 28 Oct 2025 12:04:12 +1000 Subject: [PATCH 2/2] ML-KEM SHA-3: fix r Constant r wasn't being loaded into register in all assembly functions that use it - it just got lucky most of the time. --- wolfcrypt/src/port/arm/armv8-mlkem-asm.S | 14 +++ wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c | 130 +++++++++++---------- 2 files changed, 80 insertions(+), 64 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-mlkem-asm.S b/wolfcrypt/src/port/arm/armv8-mlkem-asm.S index dc92a5225..a45475c9f 100644 --- a/wolfcrypt/src/port/arm/armv8-mlkem-asm.S +++ b/wolfcrypt/src/port/arm/armv8-mlkem-asm.S @@ -10074,6 +10074,13 @@ _mlkem_shake128_blocksx3_seed_neon: stp d10, d11, [x29, #176] stp d12, d13, [x29, #192] stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_sha3_aarch64_r + add x28, x28, :lo12:L_sha3_aarch64_r +#else + adrp x28, L_sha3_aarch64_r@PAGE + add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF +#endif /* __APPLE__ */ str x0, [x29, #40] add x0, x0, #32 ld1 {v4.d}[0], [x0] @@ -10414,6 +10421,13 @@ _mlkem_shake256_blocksx3_seed_neon: stp d10, d11, [x29, #176] stp d12, d13, [x29, #192] stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_sha3_aarch64_r + add x28, x28, :lo12:L_sha3_aarch64_r +#else + adrp x28, L_sha3_aarch64_r@PAGE + add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF +#endif /* __APPLE__ */ str x0, [x29, #40] add x0, x0, #32 ld1 {v4.d}[0], [x0] diff --git a/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c b/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c index 3ca763809..fe092576c 100644 --- a/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c @@ -9417,6 +9417,7 @@ void mlkem_sha3_blocksx3_neon(word64* state) void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) { + const word64* r = L_sha3_aarch64_r; __asm__ __volatile__ ( "stp x29, x30, [sp, #-64]!\n\t" "add x29, sp, #0\n\t" @@ -9476,57 +9477,57 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) /* Start of 24 rounds */ "\n" "L_SHA3_shake128_blocksx3_seed_neon_begin_%=: \n\t" - "stp x28, %x[seed], [x29, #48]\n\t" + "stp %[r], %x[seed], [x29, #48]\n\t" /* Col Mix */ "eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t" "eor %x[state], x6, x11\n\t" "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" "eor x30, x2, x7\n\t" "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" - "eor x28, x4, x9\n\t" + "eor %[r], x4, x9\n\t" "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" "eor %x[state], %x[state], x16\n\t" "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" "eor x30, x30, x12\n\t" "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" - "eor x28, x28, x14\n\t" + "eor %[r], %[r], x14\n\t" "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" "eor %x[state], %x[state], x22\n\t" "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" "eor x30, x30, x17\n\t" "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" - "eor x28, x28, x20\n\t" + "eor %[r], %[r], x20\n\t" "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" "eor %x[state], %x[state], x27\n\t" "rax1 v25.2d, v30.2d, v27.2d\n\t" "eor x30, x30, x23\n\t" "rax1 v26.2d, v31.2d, v28.2d\n\t" - "eor x28, x28, x25\n\t" + "eor %[r], %[r], x25\n\t" "rax1 v27.2d, v27.2d, v29.2d\n\t" "str %x[state], [x29, #32]\n\t" "rax1 v28.2d, v28.2d, v30.2d\n\t" - "str x28, [x29, #24]\n\t" + "str %[r], [x29, #24]\n\t" "rax1 v29.2d, v29.2d, v31.2d\n\t" "eor %x[seed], x3, x8\n\t" "eor v0.16b, v0.16b, v25.16b\n\t" "xar v30.2d, v1.2d, v26.2d, #63\n\t" - "eor x28, x5, x10\n\t" + "eor %[r], x5, x10\n\t" "xar v1.2d, v6.2d, v26.2d, #20\n\t" "eor %x[seed], %x[seed], x13\n\t" "xar v6.2d, v9.2d, v29.2d, #44\n\t" - "eor x28, x28, x15\n\t" + "eor %[r], %[r], x15\n\t" "xar v9.2d, v22.2d, v27.2d, #3\n\t" "eor %x[seed], %x[seed], x19\n\t" "xar v22.2d, v14.2d, v29.2d, #25\n\t" - "eor x28, x28, x21\n\t" + "eor %[r], %[r], x21\n\t" "xar v14.2d, v20.2d, v25.2d, #46\n\t" "eor %x[seed], %x[seed], x24\n\t" "xar v20.2d, v2.2d, v27.2d, #2\n\t" - "eor x28, x28, x26\n\t" + "eor %[r], %[r], x26\n\t" "xar v2.2d, v12.2d, v27.2d, #21\n\t" "eor %x[state], %x[state], %x[seed], ror 63\n\t" "xar v12.2d, v13.2d, v28.2d, #39\n\t" - "eor %x[seed], %x[seed], x28, ror 63\n\t" + "eor %x[seed], %x[seed], %[r], ror 63\n\t" "xar v13.2d, v19.2d, v29.2d, #56\n\t" "eor x2, x2, %x[state]\n\t" "xar v19.2d, v23.2d, v28.2d, #8\n\t" @@ -9552,22 +9553,22 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) "xar v18.2d, v17.2d, v27.2d, #49\n\t" "ldr %x[seed], [x29, #24]\n\t" "xar v17.2d, v11.2d, v26.2d, #54\n\t" - "eor x28, x28, x30, ror 63\n\t" + "eor %[r], %[r], x30, ror 63\n\t" "xar v11.2d, v7.2d, v27.2d, #58\n\t" "eor x30, x30, %x[seed], ror 63\n\t" "xar v7.2d, v10.2d, v25.2d, #61\n\t" "eor %x[seed], %x[seed], %x[state], ror 63\n\t" /* Row Mix */ "mov v25.16b, v0.16b\n\t" - "eor x6, x6, x28\n\t" + "eor x6, x6, %[r]\n\t" "mov v26.16b, v1.16b\n\t" - "eor x11, x11, x28\n\t" + "eor x11, x11, %[r]\n\t" "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" - "eor x16, x16, x28\n\t" + "eor x16, x16, %[r]\n\t" "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" - "eor x22, x22, x28\n\t" + "eor x22, x22, %[r]\n\t" "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" - "eor x27, x27, x28\n\t" + "eor x27, x27, %[r]\n\t" "bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t" "eor x3, x3, x30\n\t" "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" @@ -9635,57 +9636,57 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) /* Row Mix Base */ "bic x12, x4, x3\n\t" "bic %x[seed], x5, x4\n\t" - "bic x28, x2, x6\n\t" + "bic %[r], x2, x6\n\t" "bic x30, x3, x2\n\t" "eor x2, x2, x12\n\t" "eor x3, x3, %x[seed]\n\t" "bic x12, x6, x5\n\t" - "eor x5, x5, x28\n\t" + "eor x5, x5, %[r]\n\t" "eor x4, x4, x12\n\t" "eor x6, x6, x30\n\t" "bic x12, x9, x8\n\t" "bic %x[seed], x10, x9\n\t" - "bic x28, x7, x11\n\t" + "bic %[r], x7, x11\n\t" "bic x30, x8, x7\n\t" "eor x7, x7, x12\n\t" "eor x8, x8, %x[seed]\n\t" "bic x12, x11, x10\n\t" - "eor x10, x10, x28\n\t" + "eor x10, x10, %[r]\n\t" "eor x9, x9, x12\n\t" "eor x11, x11, x30\n\t" "bic x12, x14, x13\n\t" "bic %x[seed], x15, x14\n\t" - "bic x28, %x[state], x16\n\t" + "bic %[r], %x[state], x16\n\t" "bic x30, x13, %x[state]\n\t" "eor x12, %x[state], x12\n\t" "eor x13, x13, %x[seed]\n\t" "bic %x[state], x16, x15\n\t" - "eor x15, x15, x28\n\t" + "eor x15, x15, %[r]\n\t" "eor x14, x14, %x[state]\n\t" "eor x16, x16, x30\n\t" "bic %x[state], x20, x19\n\t" "bic %x[seed], x21, x20\n\t" - "bic x28, x17, x22\n\t" + "bic %[r], x17, x22\n\t" "bic x30, x19, x17\n\t" "eor x17, x17, %x[state]\n\t" "eor x19, x19, %x[seed]\n\t" "bic %x[state], x22, x21\n\t" - "eor x21, x21, x28\n\t" + "eor x21, x21, %[r]\n\t" "eor x20, x20, %x[state]\n\t" "eor x22, x22, x30\n\t" "bic %x[state], x25, x24\n\t" "bic %x[seed], x26, x25\n\t" - "bic x28, x23, x27\n\t" + "bic %[r], x23, x27\n\t" "bic x30, x24, x23\n\t" "eor x23, x23, %x[state]\n\t" "eor x24, x24, %x[seed]\n\t" "bic %x[state], x27, x26\n\t" - "eor x26, x26, x28\n\t" + "eor x26, x26, %[r]\n\t" "eor x25, x25, %x[state]\n\t" "eor x27, x27, x30\n\t" /* Done transforming */ - "ldp x28, %x[seed], [x29, #48]\n\t" - "ldr %x[state], [x28], #8\n\t" + "ldp %[r], %x[seed], [x29, #48]\n\t" + "ldr %x[state], [%[r]], #8\n\t" "subs %x[seed], %x[seed], #1\n\t" "mov v30.d[0], %x[state]\n\t" "mov v30.d[1], %x[state]\n\t" @@ -9724,11 +9725,11 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) "str x27, [%x[state], #192]\n\t" "ldp x29, x30, [sp], #0x40\n\t" : [state] "+r" (state), [seed] "+r" (seed) - : + : [r] "r" (r) : "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", - "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", - "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", + "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" @@ -9737,6 +9738,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) { + const word64* r = L_sha3_aarch64_r; __asm__ __volatile__ ( "stp x29, x30, [sp, #-64]!\n\t" "add x29, sp, #0\n\t" @@ -9796,57 +9798,57 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) /* Start of 24 rounds */ "\n" "L_SHA3_shake256_blocksx3_seed_neon_begin_%=: \n\t" - "stp x28, %x[seed], [x29, #48]\n\t" + "stp %[r], %x[seed], [x29, #48]\n\t" /* Col Mix */ "eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t" "eor %x[state], x6, x11\n\t" "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" "eor x30, x2, x7\n\t" "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" - "eor x28, x4, x9\n\t" + "eor %[r], x4, x9\n\t" "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" "eor %x[state], %x[state], x16\n\t" "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" "eor x30, x30, x12\n\t" "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" - "eor x28, x28, x14\n\t" + "eor %[r], %[r], x14\n\t" "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" "eor %x[state], %x[state], x22\n\t" "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" "eor x30, x30, x17\n\t" "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" - "eor x28, x28, x20\n\t" + "eor %[r], %[r], x20\n\t" "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" "eor %x[state], %x[state], x27\n\t" "rax1 v25.2d, v30.2d, v27.2d\n\t" "eor x30, x30, x23\n\t" "rax1 v26.2d, v31.2d, v28.2d\n\t" - "eor x28, x28, x25\n\t" + "eor %[r], %[r], x25\n\t" "rax1 v27.2d, v27.2d, v29.2d\n\t" "str %x[state], [x29, #32]\n\t" "rax1 v28.2d, v28.2d, v30.2d\n\t" - "str x28, [x29, #24]\n\t" + "str %[r], [x29, #24]\n\t" "rax1 v29.2d, v29.2d, v31.2d\n\t" "eor %x[seed], x3, x8\n\t" "eor v0.16b, v0.16b, v25.16b\n\t" "xar v30.2d, v1.2d, v26.2d, #63\n\t" - "eor x28, x5, x10\n\t" + "eor %[r], x5, x10\n\t" "xar v1.2d, v6.2d, v26.2d, #20\n\t" "eor %x[seed], %x[seed], x13\n\t" "xar v6.2d, v9.2d, v29.2d, #44\n\t" - "eor x28, x28, x15\n\t" + "eor %[r], %[r], x15\n\t" "xar v9.2d, v22.2d, v27.2d, #3\n\t" "eor %x[seed], %x[seed], x19\n\t" "xar v22.2d, v14.2d, v29.2d, #25\n\t" - "eor x28, x28, x21\n\t" + "eor %[r], %[r], x21\n\t" "xar v14.2d, v20.2d, v25.2d, #46\n\t" "eor %x[seed], %x[seed], x24\n\t" "xar v20.2d, v2.2d, v27.2d, #2\n\t" - "eor x28, x28, x26\n\t" + "eor %[r], %[r], x26\n\t" "xar v2.2d, v12.2d, v27.2d, #21\n\t" "eor %x[state], %x[state], %x[seed], ror 63\n\t" "xar v12.2d, v13.2d, v28.2d, #39\n\t" - "eor %x[seed], %x[seed], x28, ror 63\n\t" + "eor %x[seed], %x[seed], %[r], ror 63\n\t" "xar v13.2d, v19.2d, v29.2d, #56\n\t" "eor x2, x2, %x[state]\n\t" "xar v19.2d, v23.2d, v28.2d, #8\n\t" @@ -9872,22 +9874,22 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) "xar v18.2d, v17.2d, v27.2d, #49\n\t" "ldr %x[seed], [x29, #24]\n\t" "xar v17.2d, v11.2d, v26.2d, #54\n\t" - "eor x28, x28, x30, ror 63\n\t" + "eor %[r], %[r], x30, ror 63\n\t" "xar v11.2d, v7.2d, v27.2d, #58\n\t" "eor x30, x30, %x[seed], ror 63\n\t" "xar v7.2d, v10.2d, v25.2d, #61\n\t" "eor %x[seed], %x[seed], %x[state], ror 63\n\t" /* Row Mix */ "mov v25.16b, v0.16b\n\t" - "eor x6, x6, x28\n\t" + "eor x6, x6, %[r]\n\t" "mov v26.16b, v1.16b\n\t" - "eor x11, x11, x28\n\t" + "eor x11, x11, %[r]\n\t" "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" - "eor x16, x16, x28\n\t" + "eor x16, x16, %[r]\n\t" "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" - "eor x22, x22, x28\n\t" + "eor x22, x22, %[r]\n\t" "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" - "eor x27, x27, x28\n\t" + "eor x27, x27, %[r]\n\t" "bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t" "eor x3, x3, x30\n\t" "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" @@ -9955,57 +9957,57 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) /* Row Mix Base */ "bic x12, x4, x3\n\t" "bic %x[seed], x5, x4\n\t" - "bic x28, x2, x6\n\t" + "bic %[r], x2, x6\n\t" "bic x30, x3, x2\n\t" "eor x2, x2, x12\n\t" "eor x3, x3, %x[seed]\n\t" "bic x12, x6, x5\n\t" - "eor x5, x5, x28\n\t" + "eor x5, x5, %[r]\n\t" "eor x4, x4, x12\n\t" "eor x6, x6, x30\n\t" "bic x12, x9, x8\n\t" "bic %x[seed], x10, x9\n\t" - "bic x28, x7, x11\n\t" + "bic %[r], x7, x11\n\t" "bic x30, x8, x7\n\t" "eor x7, x7, x12\n\t" "eor x8, x8, %x[seed]\n\t" "bic x12, x11, x10\n\t" - "eor x10, x10, x28\n\t" + "eor x10, x10, %[r]\n\t" "eor x9, x9, x12\n\t" "eor x11, x11, x30\n\t" "bic x12, x14, x13\n\t" "bic %x[seed], x15, x14\n\t" - "bic x28, %x[state], x16\n\t" + "bic %[r], %x[state], x16\n\t" "bic x30, x13, %x[state]\n\t" "eor x12, %x[state], x12\n\t" "eor x13, x13, %x[seed]\n\t" "bic %x[state], x16, x15\n\t" - "eor x15, x15, x28\n\t" + "eor x15, x15, %[r]\n\t" "eor x14, x14, %x[state]\n\t" "eor x16, x16, x30\n\t" "bic %x[state], x20, x19\n\t" "bic %x[seed], x21, x20\n\t" - "bic x28, x17, x22\n\t" + "bic %[r], x17, x22\n\t" "bic x30, x19, x17\n\t" "eor x17, x17, %x[state]\n\t" "eor x19, x19, %x[seed]\n\t" "bic %x[state], x22, x21\n\t" - "eor x21, x21, x28\n\t" + "eor x21, x21, %[r]\n\t" "eor x20, x20, %x[state]\n\t" "eor x22, x22, x30\n\t" "bic %x[state], x25, x24\n\t" "bic %x[seed], x26, x25\n\t" - "bic x28, x23, x27\n\t" + "bic %[r], x23, x27\n\t" "bic x30, x24, x23\n\t" "eor x23, x23, %x[state]\n\t" "eor x24, x24, %x[seed]\n\t" "bic %x[state], x27, x26\n\t" - "eor x26, x26, x28\n\t" + "eor x26, x26, %[r]\n\t" "eor x25, x25, %x[state]\n\t" "eor x27, x27, x30\n\t" /* Done transforming */ - "ldp x28, %x[seed], [x29, #48]\n\t" - "ldr %x[state], [x28], #8\n\t" + "ldp %[r], %x[seed], [x29, #48]\n\t" + "ldr %x[state], [%[r]], #8\n\t" "subs %x[seed], %x[seed], #1\n\t" "mov v30.d[0], %x[state]\n\t" "mov v30.d[1], %x[state]\n\t" @@ -10044,11 +10046,11 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) "str x27, [%x[state], #192]\n\t" "ldp x29, x30, [sp], #0x40\n\t" : [state] "+r" (state), [seed] "+r" (seed) - : + : [r] "r" (r) : "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", - "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", - "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", + "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"