diff --git a/wolfcrypt/src/port/arm/armv8-mlkem-asm.S b/wolfcrypt/src/port/arm/armv8-mlkem-asm.S index dc92a5225..a45475c9f 100644 --- a/wolfcrypt/src/port/arm/armv8-mlkem-asm.S +++ b/wolfcrypt/src/port/arm/armv8-mlkem-asm.S @@ -10074,6 +10074,13 @@ _mlkem_shake128_blocksx3_seed_neon: stp d10, d11, [x29, #176] stp d12, d13, [x29, #192] stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_sha3_aarch64_r + add x28, x28, :lo12:L_sha3_aarch64_r +#else + adrp x28, L_sha3_aarch64_r@PAGE + add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF +#endif /* __APPLE__ */ str x0, [x29, #40] add x0, x0, #32 ld1 {v4.d}[0], [x0] @@ -10414,6 +10421,13 @@ _mlkem_shake256_blocksx3_seed_neon: stp d10, d11, [x29, #176] stp d12, d13, [x29, #192] stp d14, d15, [x29, #208] +#ifndef __APPLE__ + adrp x28, L_sha3_aarch64_r + add x28, x28, :lo12:L_sha3_aarch64_r +#else + adrp x28, L_sha3_aarch64_r@PAGE + add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF +#endif /* __APPLE__ */ str x0, [x29, #40] add x0, x0, #32 ld1 {v4.d}[0], [x0] diff --git a/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c b/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c index 3ca763809..fe092576c 100644 --- a/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-mlkem-asm_c.c @@ -9417,6 +9417,7 @@ void mlkem_sha3_blocksx3_neon(word64* state) void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) { + const word64* r = L_sha3_aarch64_r; __asm__ __volatile__ ( "stp x29, x30, [sp, #-64]!\n\t" "add x29, sp, #0\n\t" @@ -9476,57 +9477,57 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) /* Start of 24 rounds */ "\n" "L_SHA3_shake128_blocksx3_seed_neon_begin_%=: \n\t" - "stp x28, %x[seed], [x29, #48]\n\t" + "stp %[r], %x[seed], [x29, #48]\n\t" /* Col Mix */ "eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t" "eor %x[state], x6, x11\n\t" "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" "eor x30, x2, x7\n\t" "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" - "eor x28, x4, x9\n\t" + "eor %[r], x4, x9\n\t" "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" "eor %x[state], %x[state], x16\n\t" "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" "eor x30, x30, x12\n\t" "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" - "eor x28, x28, x14\n\t" + "eor %[r], %[r], x14\n\t" "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" "eor %x[state], %x[state], x22\n\t" "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" "eor x30, x30, x17\n\t" "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" - "eor x28, x28, x20\n\t" + "eor %[r], %[r], x20\n\t" "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" "eor %x[state], %x[state], x27\n\t" "rax1 v25.2d, v30.2d, v27.2d\n\t" "eor x30, x30, x23\n\t" "rax1 v26.2d, v31.2d, v28.2d\n\t" - "eor x28, x28, x25\n\t" + "eor %[r], %[r], x25\n\t" "rax1 v27.2d, v27.2d, v29.2d\n\t" "str %x[state], [x29, #32]\n\t" "rax1 v28.2d, v28.2d, v30.2d\n\t" - "str x28, [x29, #24]\n\t" + "str %[r], [x29, #24]\n\t" "rax1 v29.2d, v29.2d, v31.2d\n\t" "eor %x[seed], x3, x8\n\t" "eor v0.16b, v0.16b, v25.16b\n\t" "xar v30.2d, v1.2d, v26.2d, #63\n\t" - "eor x28, x5, x10\n\t" + "eor %[r], x5, x10\n\t" "xar v1.2d, v6.2d, v26.2d, #20\n\t" "eor %x[seed], %x[seed], x13\n\t" "xar v6.2d, v9.2d, v29.2d, #44\n\t" - "eor x28, x28, x15\n\t" + "eor %[r], %[r], x15\n\t" "xar v9.2d, v22.2d, v27.2d, #3\n\t" "eor %x[seed], %x[seed], x19\n\t" "xar v22.2d, v14.2d, v29.2d, #25\n\t" - "eor x28, x28, x21\n\t" + "eor %[r], %[r], x21\n\t" "xar v14.2d, v20.2d, v25.2d, #46\n\t" "eor %x[seed], %x[seed], x24\n\t" "xar v20.2d, v2.2d, v27.2d, #2\n\t" - "eor x28, x28, x26\n\t" + "eor %[r], %[r], x26\n\t" "xar v2.2d, v12.2d, v27.2d, #21\n\t" "eor %x[state], %x[state], %x[seed], ror 63\n\t" "xar v12.2d, v13.2d, v28.2d, #39\n\t" - "eor %x[seed], %x[seed], x28, ror 63\n\t" + "eor %x[seed], %x[seed], %[r], ror 63\n\t" "xar v13.2d, v19.2d, v29.2d, #56\n\t" "eor x2, x2, %x[state]\n\t" "xar v19.2d, v23.2d, v28.2d, #8\n\t" @@ -9552,22 +9553,22 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) "xar v18.2d, v17.2d, v27.2d, #49\n\t" "ldr %x[seed], [x29, #24]\n\t" "xar v17.2d, v11.2d, v26.2d, #54\n\t" - "eor x28, x28, x30, ror 63\n\t" + "eor %[r], %[r], x30, ror 63\n\t" "xar v11.2d, v7.2d, v27.2d, #58\n\t" "eor x30, x30, %x[seed], ror 63\n\t" "xar v7.2d, v10.2d, v25.2d, #61\n\t" "eor %x[seed], %x[seed], %x[state], ror 63\n\t" /* Row Mix */ "mov v25.16b, v0.16b\n\t" - "eor x6, x6, x28\n\t" + "eor x6, x6, %[r]\n\t" "mov v26.16b, v1.16b\n\t" - "eor x11, x11, x28\n\t" + "eor x11, x11, %[r]\n\t" "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" - "eor x16, x16, x28\n\t" + "eor x16, x16, %[r]\n\t" "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" - "eor x22, x22, x28\n\t" + "eor x22, x22, %[r]\n\t" "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" - "eor x27, x27, x28\n\t" + "eor x27, x27, %[r]\n\t" "bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t" "eor x3, x3, x30\n\t" "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" @@ -9635,57 +9636,57 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) /* Row Mix Base */ "bic x12, x4, x3\n\t" "bic %x[seed], x5, x4\n\t" - "bic x28, x2, x6\n\t" + "bic %[r], x2, x6\n\t" "bic x30, x3, x2\n\t" "eor x2, x2, x12\n\t" "eor x3, x3, %x[seed]\n\t" "bic x12, x6, x5\n\t" - "eor x5, x5, x28\n\t" + "eor x5, x5, %[r]\n\t" "eor x4, x4, x12\n\t" "eor x6, x6, x30\n\t" "bic x12, x9, x8\n\t" "bic %x[seed], x10, x9\n\t" - "bic x28, x7, x11\n\t" + "bic %[r], x7, x11\n\t" "bic x30, x8, x7\n\t" "eor x7, x7, x12\n\t" "eor x8, x8, %x[seed]\n\t" "bic x12, x11, x10\n\t" - "eor x10, x10, x28\n\t" + "eor x10, x10, %[r]\n\t" "eor x9, x9, x12\n\t" "eor x11, x11, x30\n\t" "bic x12, x14, x13\n\t" "bic %x[seed], x15, x14\n\t" - "bic x28, %x[state], x16\n\t" + "bic %[r], %x[state], x16\n\t" "bic x30, x13, %x[state]\n\t" "eor x12, %x[state], x12\n\t" "eor x13, x13, %x[seed]\n\t" "bic %x[state], x16, x15\n\t" - "eor x15, x15, x28\n\t" + "eor x15, x15, %[r]\n\t" "eor x14, x14, %x[state]\n\t" "eor x16, x16, x30\n\t" "bic %x[state], x20, x19\n\t" "bic %x[seed], x21, x20\n\t" - "bic x28, x17, x22\n\t" + "bic %[r], x17, x22\n\t" "bic x30, x19, x17\n\t" "eor x17, x17, %x[state]\n\t" "eor x19, x19, %x[seed]\n\t" "bic %x[state], x22, x21\n\t" - "eor x21, x21, x28\n\t" + "eor x21, x21, %[r]\n\t" "eor x20, x20, %x[state]\n\t" "eor x22, x22, x30\n\t" "bic %x[state], x25, x24\n\t" "bic %x[seed], x26, x25\n\t" - "bic x28, x23, x27\n\t" + "bic %[r], x23, x27\n\t" "bic x30, x24, x23\n\t" "eor x23, x23, %x[state]\n\t" "eor x24, x24, %x[seed]\n\t" "bic %x[state], x27, x26\n\t" - "eor x26, x26, x28\n\t" + "eor x26, x26, %[r]\n\t" "eor x25, x25, %x[state]\n\t" "eor x27, x27, x30\n\t" /* Done transforming */ - "ldp x28, %x[seed], [x29, #48]\n\t" - "ldr %x[state], [x28], #8\n\t" + "ldp %[r], %x[seed], [x29, #48]\n\t" + "ldr %x[state], [%[r]], #8\n\t" "subs %x[seed], %x[seed], #1\n\t" "mov v30.d[0], %x[state]\n\t" "mov v30.d[1], %x[state]\n\t" @@ -9724,11 +9725,11 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) "str x27, [%x[state], #192]\n\t" "ldp x29, x30, [sp], #0x40\n\t" : [state] "+r" (state), [seed] "+r" (seed) - : + : [r] "r" (r) : "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", - "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", - "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", + "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" @@ -9737,6 +9738,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed) void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) { + const word64* r = L_sha3_aarch64_r; __asm__ __volatile__ ( "stp x29, x30, [sp, #-64]!\n\t" "add x29, sp, #0\n\t" @@ -9796,57 +9798,57 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) /* Start of 24 rounds */ "\n" "L_SHA3_shake256_blocksx3_seed_neon_begin_%=: \n\t" - "stp x28, %x[seed], [x29, #48]\n\t" + "stp %[r], %x[seed], [x29, #48]\n\t" /* Col Mix */ "eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t" "eor %x[state], x6, x11\n\t" "eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t" "eor x30, x2, x7\n\t" "eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t" - "eor x28, x4, x9\n\t" + "eor %[r], x4, x9\n\t" "eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t" "eor %x[state], %x[state], x16\n\t" "eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t" "eor x30, x30, x12\n\t" "eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t" - "eor x28, x28, x14\n\t" + "eor %[r], %[r], x14\n\t" "eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t" "eor %x[state], %x[state], x22\n\t" "eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t" "eor x30, x30, x17\n\t" "eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t" - "eor x28, x28, x20\n\t" + "eor %[r], %[r], x20\n\t" "eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t" "eor %x[state], %x[state], x27\n\t" "rax1 v25.2d, v30.2d, v27.2d\n\t" "eor x30, x30, x23\n\t" "rax1 v26.2d, v31.2d, v28.2d\n\t" - "eor x28, x28, x25\n\t" + "eor %[r], %[r], x25\n\t" "rax1 v27.2d, v27.2d, v29.2d\n\t" "str %x[state], [x29, #32]\n\t" "rax1 v28.2d, v28.2d, v30.2d\n\t" - "str x28, [x29, #24]\n\t" + "str %[r], [x29, #24]\n\t" "rax1 v29.2d, v29.2d, v31.2d\n\t" "eor %x[seed], x3, x8\n\t" "eor v0.16b, v0.16b, v25.16b\n\t" "xar v30.2d, v1.2d, v26.2d, #63\n\t" - "eor x28, x5, x10\n\t" + "eor %[r], x5, x10\n\t" "xar v1.2d, v6.2d, v26.2d, #20\n\t" "eor %x[seed], %x[seed], x13\n\t" "xar v6.2d, v9.2d, v29.2d, #44\n\t" - "eor x28, x28, x15\n\t" + "eor %[r], %[r], x15\n\t" "xar v9.2d, v22.2d, v27.2d, #3\n\t" "eor %x[seed], %x[seed], x19\n\t" "xar v22.2d, v14.2d, v29.2d, #25\n\t" - "eor x28, x28, x21\n\t" + "eor %[r], %[r], x21\n\t" "xar v14.2d, v20.2d, v25.2d, #46\n\t" "eor %x[seed], %x[seed], x24\n\t" "xar v20.2d, v2.2d, v27.2d, #2\n\t" - "eor x28, x28, x26\n\t" + "eor %[r], %[r], x26\n\t" "xar v2.2d, v12.2d, v27.2d, #21\n\t" "eor %x[state], %x[state], %x[seed], ror 63\n\t" "xar v12.2d, v13.2d, v28.2d, #39\n\t" - "eor %x[seed], %x[seed], x28, ror 63\n\t" + "eor %x[seed], %x[seed], %[r], ror 63\n\t" "xar v13.2d, v19.2d, v29.2d, #56\n\t" "eor x2, x2, %x[state]\n\t" "xar v19.2d, v23.2d, v28.2d, #8\n\t" @@ -9872,22 +9874,22 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) "xar v18.2d, v17.2d, v27.2d, #49\n\t" "ldr %x[seed], [x29, #24]\n\t" "xar v17.2d, v11.2d, v26.2d, #54\n\t" - "eor x28, x28, x30, ror 63\n\t" + "eor %[r], %[r], x30, ror 63\n\t" "xar v11.2d, v7.2d, v27.2d, #58\n\t" "eor x30, x30, %x[seed], ror 63\n\t" "xar v7.2d, v10.2d, v25.2d, #61\n\t" "eor %x[seed], %x[seed], %x[state], ror 63\n\t" /* Row Mix */ "mov v25.16b, v0.16b\n\t" - "eor x6, x6, x28\n\t" + "eor x6, x6, %[r]\n\t" "mov v26.16b, v1.16b\n\t" - "eor x11, x11, x28\n\t" + "eor x11, x11, %[r]\n\t" "bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t" - "eor x16, x16, x28\n\t" + "eor x16, x16, %[r]\n\t" "bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t" - "eor x22, x22, x28\n\t" + "eor x22, x22, %[r]\n\t" "bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t" - "eor x27, x27, x28\n\t" + "eor x27, x27, %[r]\n\t" "bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t" "eor x3, x3, x30\n\t" "bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t" @@ -9955,57 +9957,57 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) /* Row Mix Base */ "bic x12, x4, x3\n\t" "bic %x[seed], x5, x4\n\t" - "bic x28, x2, x6\n\t" + "bic %[r], x2, x6\n\t" "bic x30, x3, x2\n\t" "eor x2, x2, x12\n\t" "eor x3, x3, %x[seed]\n\t" "bic x12, x6, x5\n\t" - "eor x5, x5, x28\n\t" + "eor x5, x5, %[r]\n\t" "eor x4, x4, x12\n\t" "eor x6, x6, x30\n\t" "bic x12, x9, x8\n\t" "bic %x[seed], x10, x9\n\t" - "bic x28, x7, x11\n\t" + "bic %[r], x7, x11\n\t" "bic x30, x8, x7\n\t" "eor x7, x7, x12\n\t" "eor x8, x8, %x[seed]\n\t" "bic x12, x11, x10\n\t" - "eor x10, x10, x28\n\t" + "eor x10, x10, %[r]\n\t" "eor x9, x9, x12\n\t" "eor x11, x11, x30\n\t" "bic x12, x14, x13\n\t" "bic %x[seed], x15, x14\n\t" - "bic x28, %x[state], x16\n\t" + "bic %[r], %x[state], x16\n\t" "bic x30, x13, %x[state]\n\t" "eor x12, %x[state], x12\n\t" "eor x13, x13, %x[seed]\n\t" "bic %x[state], x16, x15\n\t" - "eor x15, x15, x28\n\t" + "eor x15, x15, %[r]\n\t" "eor x14, x14, %x[state]\n\t" "eor x16, x16, x30\n\t" "bic %x[state], x20, x19\n\t" "bic %x[seed], x21, x20\n\t" - "bic x28, x17, x22\n\t" + "bic %[r], x17, x22\n\t" "bic x30, x19, x17\n\t" "eor x17, x17, %x[state]\n\t" "eor x19, x19, %x[seed]\n\t" "bic %x[state], x22, x21\n\t" - "eor x21, x21, x28\n\t" + "eor x21, x21, %[r]\n\t" "eor x20, x20, %x[state]\n\t" "eor x22, x22, x30\n\t" "bic %x[state], x25, x24\n\t" "bic %x[seed], x26, x25\n\t" - "bic x28, x23, x27\n\t" + "bic %[r], x23, x27\n\t" "bic x30, x24, x23\n\t" "eor x23, x23, %x[state]\n\t" "eor x24, x24, %x[seed]\n\t" "bic %x[state], x27, x26\n\t" - "eor x26, x26, x28\n\t" + "eor x26, x26, %[r]\n\t" "eor x25, x25, %x[state]\n\t" "eor x27, x27, x30\n\t" /* Done transforming */ - "ldp x28, %x[seed], [x29, #48]\n\t" - "ldr %x[state], [x28], #8\n\t" + "ldp %[r], %x[seed], [x29, #48]\n\t" + "ldr %x[state], [%[r]], #8\n\t" "subs %x[seed], %x[seed], #1\n\t" "mov v30.d[0], %x[state]\n\t" "mov v30.d[1], %x[state]\n\t" @@ -10044,11 +10046,11 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed) "str x27, [%x[state], #192]\n\t" "ldp x29, x30, [sp], #0x40\n\t" : [state] "+r" (state), [seed] "+r" (seed) - : + : [r] "r" (r) : "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", - "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1", - "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", + "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"