ML-KEM SHA-3: fix r

The pointer to the constant r (L_sha3_aarch64_r) was not being loaded into a
register in every assembly function that uses it; those functions only worked
when the register happened to already hold the right address.
This commit is contained in:
Sean Parkinson
2025-10-28 12:04:12 +10:00
parent 070923a373
commit d883a950d2
2 changed files with 80 additions and 64 deletions

View File

@@ -10074,6 +10074,13 @@ _mlkem_shake128_blocksx3_seed_neon:
stp d10, d11, [x29, #176]
stp d12, d13, [x29, #192]
stp d14, d15, [x29, #208]
#ifndef __APPLE__
adrp x28, L_sha3_aarch64_r
add x28, x28, :lo12:L_sha3_aarch64_r
#else
adrp x28, L_sha3_aarch64_r@PAGE
add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
str x0, [x29, #40]
add x0, x0, #32
ld1 {v4.d}[0], [x0]
@@ -10414,6 +10421,13 @@ _mlkem_shake256_blocksx3_seed_neon:
stp d10, d11, [x29, #176]
stp d12, d13, [x29, #192]
stp d14, d15, [x29, #208]
#ifndef __APPLE__
adrp x28, L_sha3_aarch64_r
add x28, x28, :lo12:L_sha3_aarch64_r
#else
adrp x28, L_sha3_aarch64_r@PAGE
add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
str x0, [x29, #40]
add x0, x0, #32
ld1 {v4.d}[0], [x0]

View File

@@ -9417,6 +9417,7 @@ void mlkem_sha3_blocksx3_neon(word64* state)
void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
{
const word64* r = L_sha3_aarch64_r;
__asm__ __volatile__ (
"stp x29, x30, [sp, #-64]!\n\t"
"add x29, sp, #0\n\t"
@@ -9476,57 +9477,57 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
/* Start of 24 rounds */
"\n"
"L_SHA3_shake128_blocksx3_seed_neon_begin_%=: \n\t"
"stp x28, %x[seed], [x29, #48]\n\t"
"stp %[r], %x[seed], [x29, #48]\n\t"
/* Col Mix */
"eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t"
"eor %x[state], x6, x11\n\t"
"eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t"
"eor x30, x2, x7\n\t"
"eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t"
"eor x28, x4, x9\n\t"
"eor %[r], x4, x9\n\t"
"eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t"
"eor %x[state], %x[state], x16\n\t"
"eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t"
"eor x30, x30, x12\n\t"
"eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t"
"eor x28, x28, x14\n\t"
"eor %[r], %[r], x14\n\t"
"eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t"
"eor %x[state], %x[state], x22\n\t"
"eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t"
"eor x30, x30, x17\n\t"
"eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t"
"eor x28, x28, x20\n\t"
"eor %[r], %[r], x20\n\t"
"eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t"
"eor %x[state], %x[state], x27\n\t"
"rax1 v25.2d, v30.2d, v27.2d\n\t"
"eor x30, x30, x23\n\t"
"rax1 v26.2d, v31.2d, v28.2d\n\t"
"eor x28, x28, x25\n\t"
"eor %[r], %[r], x25\n\t"
"rax1 v27.2d, v27.2d, v29.2d\n\t"
"str %x[state], [x29, #32]\n\t"
"rax1 v28.2d, v28.2d, v30.2d\n\t"
"str x28, [x29, #24]\n\t"
"str %[r], [x29, #24]\n\t"
"rax1 v29.2d, v29.2d, v31.2d\n\t"
"eor %x[seed], x3, x8\n\t"
"eor v0.16b, v0.16b, v25.16b\n\t"
"xar v30.2d, v1.2d, v26.2d, #63\n\t"
"eor x28, x5, x10\n\t"
"eor %[r], x5, x10\n\t"
"xar v1.2d, v6.2d, v26.2d, #20\n\t"
"eor %x[seed], %x[seed], x13\n\t"
"xar v6.2d, v9.2d, v29.2d, #44\n\t"
"eor x28, x28, x15\n\t"
"eor %[r], %[r], x15\n\t"
"xar v9.2d, v22.2d, v27.2d, #3\n\t"
"eor %x[seed], %x[seed], x19\n\t"
"xar v22.2d, v14.2d, v29.2d, #25\n\t"
"eor x28, x28, x21\n\t"
"eor %[r], %[r], x21\n\t"
"xar v14.2d, v20.2d, v25.2d, #46\n\t"
"eor %x[seed], %x[seed], x24\n\t"
"xar v20.2d, v2.2d, v27.2d, #2\n\t"
"eor x28, x28, x26\n\t"
"eor %[r], %[r], x26\n\t"
"xar v2.2d, v12.2d, v27.2d, #21\n\t"
"eor %x[state], %x[state], %x[seed], ror 63\n\t"
"xar v12.2d, v13.2d, v28.2d, #39\n\t"
"eor %x[seed], %x[seed], x28, ror 63\n\t"
"eor %x[seed], %x[seed], %[r], ror 63\n\t"
"xar v13.2d, v19.2d, v29.2d, #56\n\t"
"eor x2, x2, %x[state]\n\t"
"xar v19.2d, v23.2d, v28.2d, #8\n\t"
@@ -9552,22 +9553,22 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
"xar v18.2d, v17.2d, v27.2d, #49\n\t"
"ldr %x[seed], [x29, #24]\n\t"
"xar v17.2d, v11.2d, v26.2d, #54\n\t"
"eor x28, x28, x30, ror 63\n\t"
"eor %[r], %[r], x30, ror 63\n\t"
"xar v11.2d, v7.2d, v27.2d, #58\n\t"
"eor x30, x30, %x[seed], ror 63\n\t"
"xar v7.2d, v10.2d, v25.2d, #61\n\t"
"eor %x[seed], %x[seed], %x[state], ror 63\n\t"
/* Row Mix */
"mov v25.16b, v0.16b\n\t"
"eor x6, x6, x28\n\t"
"eor x6, x6, %[r]\n\t"
"mov v26.16b, v1.16b\n\t"
"eor x11, x11, x28\n\t"
"eor x11, x11, %[r]\n\t"
"bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t"
"eor x16, x16, x28\n\t"
"eor x16, x16, %[r]\n\t"
"bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t"
"eor x22, x22, x28\n\t"
"eor x22, x22, %[r]\n\t"
"bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t"
"eor x27, x27, x28\n\t"
"eor x27, x27, %[r]\n\t"
"bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t"
"eor x3, x3, x30\n\t"
"bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t"
@@ -9635,57 +9636,57 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
/* Row Mix Base */
"bic x12, x4, x3\n\t"
"bic %x[seed], x5, x4\n\t"
"bic x28, x2, x6\n\t"
"bic %[r], x2, x6\n\t"
"bic x30, x3, x2\n\t"
"eor x2, x2, x12\n\t"
"eor x3, x3, %x[seed]\n\t"
"bic x12, x6, x5\n\t"
"eor x5, x5, x28\n\t"
"eor x5, x5, %[r]\n\t"
"eor x4, x4, x12\n\t"
"eor x6, x6, x30\n\t"
"bic x12, x9, x8\n\t"
"bic %x[seed], x10, x9\n\t"
"bic x28, x7, x11\n\t"
"bic %[r], x7, x11\n\t"
"bic x30, x8, x7\n\t"
"eor x7, x7, x12\n\t"
"eor x8, x8, %x[seed]\n\t"
"bic x12, x11, x10\n\t"
"eor x10, x10, x28\n\t"
"eor x10, x10, %[r]\n\t"
"eor x9, x9, x12\n\t"
"eor x11, x11, x30\n\t"
"bic x12, x14, x13\n\t"
"bic %x[seed], x15, x14\n\t"
"bic x28, %x[state], x16\n\t"
"bic %[r], %x[state], x16\n\t"
"bic x30, x13, %x[state]\n\t"
"eor x12, %x[state], x12\n\t"
"eor x13, x13, %x[seed]\n\t"
"bic %x[state], x16, x15\n\t"
"eor x15, x15, x28\n\t"
"eor x15, x15, %[r]\n\t"
"eor x14, x14, %x[state]\n\t"
"eor x16, x16, x30\n\t"
"bic %x[state], x20, x19\n\t"
"bic %x[seed], x21, x20\n\t"
"bic x28, x17, x22\n\t"
"bic %[r], x17, x22\n\t"
"bic x30, x19, x17\n\t"
"eor x17, x17, %x[state]\n\t"
"eor x19, x19, %x[seed]\n\t"
"bic %x[state], x22, x21\n\t"
"eor x21, x21, x28\n\t"
"eor x21, x21, %[r]\n\t"
"eor x20, x20, %x[state]\n\t"
"eor x22, x22, x30\n\t"
"bic %x[state], x25, x24\n\t"
"bic %x[seed], x26, x25\n\t"
"bic x28, x23, x27\n\t"
"bic %[r], x23, x27\n\t"
"bic x30, x24, x23\n\t"
"eor x23, x23, %x[state]\n\t"
"eor x24, x24, %x[seed]\n\t"
"bic %x[state], x27, x26\n\t"
"eor x26, x26, x28\n\t"
"eor x26, x26, %[r]\n\t"
"eor x25, x25, %x[state]\n\t"
"eor x27, x27, x30\n\t"
/* Done transforming */
"ldp x28, %x[seed], [x29, #48]\n\t"
"ldr %x[state], [x28], #8\n\t"
"ldp %[r], %x[seed], [x29, #48]\n\t"
"ldr %x[state], [%[r]], #8\n\t"
"subs %x[seed], %x[seed], #1\n\t"
"mov v30.d[0], %x[state]\n\t"
"mov v30.d[1], %x[state]\n\t"
@@ -9724,11 +9725,11 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
"str x27, [%x[state], #192]\n\t"
"ldp x29, x30, [sp], #0x40\n\t"
: [state] "+r" (state), [seed] "+r" (seed)
:
: [r] "r" (r)
: "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
"x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20",
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1",
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2",
"v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31"
@@ -9737,6 +9738,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
{
const word64* r = L_sha3_aarch64_r;
__asm__ __volatile__ (
"stp x29, x30, [sp, #-64]!\n\t"
"add x29, sp, #0\n\t"
@@ -9796,57 +9798,57 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
/* Start of 24 rounds */
"\n"
"L_SHA3_shake256_blocksx3_seed_neon_begin_%=: \n\t"
"stp x28, %x[seed], [x29, #48]\n\t"
"stp %[r], %x[seed], [x29, #48]\n\t"
/* Col Mix */
"eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t"
"eor %x[state], x6, x11\n\t"
"eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t"
"eor x30, x2, x7\n\t"
"eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t"
"eor x28, x4, x9\n\t"
"eor %[r], x4, x9\n\t"
"eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t"
"eor %x[state], %x[state], x16\n\t"
"eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t"
"eor x30, x30, x12\n\t"
"eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t"
"eor x28, x28, x14\n\t"
"eor %[r], %[r], x14\n\t"
"eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t"
"eor %x[state], %x[state], x22\n\t"
"eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t"
"eor x30, x30, x17\n\t"
"eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t"
"eor x28, x28, x20\n\t"
"eor %[r], %[r], x20\n\t"
"eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t"
"eor %x[state], %x[state], x27\n\t"
"rax1 v25.2d, v30.2d, v27.2d\n\t"
"eor x30, x30, x23\n\t"
"rax1 v26.2d, v31.2d, v28.2d\n\t"
"eor x28, x28, x25\n\t"
"eor %[r], %[r], x25\n\t"
"rax1 v27.2d, v27.2d, v29.2d\n\t"
"str %x[state], [x29, #32]\n\t"
"rax1 v28.2d, v28.2d, v30.2d\n\t"
"str x28, [x29, #24]\n\t"
"str %[r], [x29, #24]\n\t"
"rax1 v29.2d, v29.2d, v31.2d\n\t"
"eor %x[seed], x3, x8\n\t"
"eor v0.16b, v0.16b, v25.16b\n\t"
"xar v30.2d, v1.2d, v26.2d, #63\n\t"
"eor x28, x5, x10\n\t"
"eor %[r], x5, x10\n\t"
"xar v1.2d, v6.2d, v26.2d, #20\n\t"
"eor %x[seed], %x[seed], x13\n\t"
"xar v6.2d, v9.2d, v29.2d, #44\n\t"
"eor x28, x28, x15\n\t"
"eor %[r], %[r], x15\n\t"
"xar v9.2d, v22.2d, v27.2d, #3\n\t"
"eor %x[seed], %x[seed], x19\n\t"
"xar v22.2d, v14.2d, v29.2d, #25\n\t"
"eor x28, x28, x21\n\t"
"eor %[r], %[r], x21\n\t"
"xar v14.2d, v20.2d, v25.2d, #46\n\t"
"eor %x[seed], %x[seed], x24\n\t"
"xar v20.2d, v2.2d, v27.2d, #2\n\t"
"eor x28, x28, x26\n\t"
"eor %[r], %[r], x26\n\t"
"xar v2.2d, v12.2d, v27.2d, #21\n\t"
"eor %x[state], %x[state], %x[seed], ror 63\n\t"
"xar v12.2d, v13.2d, v28.2d, #39\n\t"
"eor %x[seed], %x[seed], x28, ror 63\n\t"
"eor %x[seed], %x[seed], %[r], ror 63\n\t"
"xar v13.2d, v19.2d, v29.2d, #56\n\t"
"eor x2, x2, %x[state]\n\t"
"xar v19.2d, v23.2d, v28.2d, #8\n\t"
@@ -9872,22 +9874,22 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
"xar v18.2d, v17.2d, v27.2d, #49\n\t"
"ldr %x[seed], [x29, #24]\n\t"
"xar v17.2d, v11.2d, v26.2d, #54\n\t"
"eor x28, x28, x30, ror 63\n\t"
"eor %[r], %[r], x30, ror 63\n\t"
"xar v11.2d, v7.2d, v27.2d, #58\n\t"
"eor x30, x30, %x[seed], ror 63\n\t"
"xar v7.2d, v10.2d, v25.2d, #61\n\t"
"eor %x[seed], %x[seed], %x[state], ror 63\n\t"
/* Row Mix */
"mov v25.16b, v0.16b\n\t"
"eor x6, x6, x28\n\t"
"eor x6, x6, %[r]\n\t"
"mov v26.16b, v1.16b\n\t"
"eor x11, x11, x28\n\t"
"eor x11, x11, %[r]\n\t"
"bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t"
"eor x16, x16, x28\n\t"
"eor x16, x16, %[r]\n\t"
"bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t"
"eor x22, x22, x28\n\t"
"eor x22, x22, %[r]\n\t"
"bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t"
"eor x27, x27, x28\n\t"
"eor x27, x27, %[r]\n\t"
"bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t"
"eor x3, x3, x30\n\t"
"bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t"
@@ -9955,57 +9957,57 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
/* Row Mix Base */
"bic x12, x4, x3\n\t"
"bic %x[seed], x5, x4\n\t"
"bic x28, x2, x6\n\t"
"bic %[r], x2, x6\n\t"
"bic x30, x3, x2\n\t"
"eor x2, x2, x12\n\t"
"eor x3, x3, %x[seed]\n\t"
"bic x12, x6, x5\n\t"
"eor x5, x5, x28\n\t"
"eor x5, x5, %[r]\n\t"
"eor x4, x4, x12\n\t"
"eor x6, x6, x30\n\t"
"bic x12, x9, x8\n\t"
"bic %x[seed], x10, x9\n\t"
"bic x28, x7, x11\n\t"
"bic %[r], x7, x11\n\t"
"bic x30, x8, x7\n\t"
"eor x7, x7, x12\n\t"
"eor x8, x8, %x[seed]\n\t"
"bic x12, x11, x10\n\t"
"eor x10, x10, x28\n\t"
"eor x10, x10, %[r]\n\t"
"eor x9, x9, x12\n\t"
"eor x11, x11, x30\n\t"
"bic x12, x14, x13\n\t"
"bic %x[seed], x15, x14\n\t"
"bic x28, %x[state], x16\n\t"
"bic %[r], %x[state], x16\n\t"
"bic x30, x13, %x[state]\n\t"
"eor x12, %x[state], x12\n\t"
"eor x13, x13, %x[seed]\n\t"
"bic %x[state], x16, x15\n\t"
"eor x15, x15, x28\n\t"
"eor x15, x15, %[r]\n\t"
"eor x14, x14, %x[state]\n\t"
"eor x16, x16, x30\n\t"
"bic %x[state], x20, x19\n\t"
"bic %x[seed], x21, x20\n\t"
"bic x28, x17, x22\n\t"
"bic %[r], x17, x22\n\t"
"bic x30, x19, x17\n\t"
"eor x17, x17, %x[state]\n\t"
"eor x19, x19, %x[seed]\n\t"
"bic %x[state], x22, x21\n\t"
"eor x21, x21, x28\n\t"
"eor x21, x21, %[r]\n\t"
"eor x20, x20, %x[state]\n\t"
"eor x22, x22, x30\n\t"
"bic %x[state], x25, x24\n\t"
"bic %x[seed], x26, x25\n\t"
"bic x28, x23, x27\n\t"
"bic %[r], x23, x27\n\t"
"bic x30, x24, x23\n\t"
"eor x23, x23, %x[state]\n\t"
"eor x24, x24, %x[seed]\n\t"
"bic %x[state], x27, x26\n\t"
"eor x26, x26, x28\n\t"
"eor x26, x26, %[r]\n\t"
"eor x25, x25, %x[state]\n\t"
"eor x27, x27, x30\n\t"
/* Done transforming */
"ldp x28, %x[seed], [x29, #48]\n\t"
"ldr %x[state], [x28], #8\n\t"
"ldp %[r], %x[seed], [x29, #48]\n\t"
"ldr %x[state], [%[r]], #8\n\t"
"subs %x[seed], %x[seed], #1\n\t"
"mov v30.d[0], %x[state]\n\t"
"mov v30.d[1], %x[state]\n\t"
@@ -10044,11 +10046,11 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
"str x27, [%x[state], #192]\n\t"
"ldp x29, x30, [sp], #0x40\n\t"
: [state] "+r" (state), [seed] "+r" (seed)
:
: [r] "r" (r)
: "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
"x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20",
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1",
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2",
"v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31"