mirror of
https://github.com/wolfSSL/wolfssl.git
synced 2026-01-26 21:22:19 +01:00
Merge pull request #9355 from SparkiDev/aes_arm_asm_fix
AES ARM ASM: user data loaded 1 reg at a time
This commit is contained in:
@@ -8596,17 +8596,10 @@ AES_set_encrypt_key:
|
||||
beq L_AES_set_encrypt_key_start_128
|
||||
cmp r1, #0xc0
|
||||
beq L_AES_set_encrypt_key_start_192
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
ldm r0, {r4, r5}
|
||||
#else
|
||||
ldrd r4, r5, [r0]
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
ldr r4, [r0]
|
||||
ldr r5, [r0, #4]
|
||||
ldr r6, [r0, #8]
|
||||
ldr r7, [r0, #12]
|
||||
#else
|
||||
ldrd r6, r7, [r0, #8]
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
|
||||
# REV r4, r4
|
||||
eor r3, r4, r4, ror #16
|
||||
@@ -8635,18 +8628,10 @@ AES_set_encrypt_key:
|
||||
rev r7, r7
|
||||
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
|
||||
stm r2!, {r4, r5, r6, r7}
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
ldr r4, [r0, #16]
|
||||
ldr r5, [r0, #20]
|
||||
#else
|
||||
ldrd r4, r5, [r0, #16]
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
ldr r6, [r0, #24]
|
||||
ldr r7, [r0, #28]
|
||||
#else
|
||||
ldrd r6, r7, [r0, #24]
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
|
||||
# REV r4, r4
|
||||
eor r3, r4, r4, ror #16
|
||||
@@ -8825,23 +8810,12 @@ L_AES_set_encrypt_key_loop_256:
|
||||
sub r2, r2, #16
|
||||
b L_AES_set_encrypt_key_end
|
||||
L_AES_set_encrypt_key_start_192:
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
ldm r0, {r4, r5}
|
||||
#else
|
||||
ldrd r4, r5, [r0]
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
ldr r4, [r0]
|
||||
ldr r5, [r0, #4]
|
||||
ldr r6, [r0, #8]
|
||||
ldr r7, [r0, #12]
|
||||
#else
|
||||
ldrd r6, r7, [r0, #8]
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
ldr r1, [r0, #20]
|
||||
ldr r0, [r0, #16]
|
||||
#else
|
||||
ldrd r0, r1, [r0, #16]
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
|
||||
# REV r4, r4
|
||||
eor r3, r4, r4, ror #16
|
||||
@@ -8989,17 +8963,10 @@ L_AES_set_encrypt_key_loop_192:
|
||||
stm r2, {r0, r1, r4, r5}
|
||||
b L_AES_set_encrypt_key_end
|
||||
L_AES_set_encrypt_key_start_128:
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
ldm r0, {r4, r5}
|
||||
#else
|
||||
ldrd r4, r5, [r0]
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
ldr r4, [r0]
|
||||
ldr r5, [r0, #4]
|
||||
ldr r6, [r0, #8]
|
||||
ldr r7, [r0, #12]
|
||||
#else
|
||||
ldrd r6, r7, [r0, #8]
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
|
||||
# REV r4, r4
|
||||
eor r3, r4, r4, ror #16
|
||||
|
||||
@@ -8876,17 +8876,10 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
|
||||
"beq L_AES_set_encrypt_key_start_128_%=\n\t"
|
||||
"cmp %[len], #0xc0\n\t"
|
||||
"beq L_AES_set_encrypt_key_start_192_%=\n\t"
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
"ldm r0, {r4, r5}\n\t"
|
||||
#else
|
||||
"ldrd r4, r5, [%[key]]\n\t"
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
"ldr r4, [%[key]]\n\t"
|
||||
"ldr r5, [%[key], #4]\n\t"
|
||||
"ldr r6, [%[key], #8]\n\t"
|
||||
"ldr r7, [%[key], #12]\n\t"
|
||||
#else
|
||||
"ldrd r6, r7, [%[key], #8]\n\t"
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
|
||||
/* REV r4, r4 */
|
||||
"eor r3, r4, r4, ror #16\n\t"
|
||||
@@ -8915,18 +8908,10 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
|
||||
"rev r7, r7\n\t"
|
||||
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
|
||||
"stm %[ks]!, {r4, r5, r6, r7}\n\t"
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
"ldr r4, [%[key], #16]\n\t"
|
||||
"ldr r5, [%[key], #20]\n\t"
|
||||
#else
|
||||
"ldrd r4, r5, [%[key], #16]\n\t"
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
"ldr r6, [%[key], #24]\n\t"
|
||||
"ldr r7, [%[key], #28]\n\t"
|
||||
#else
|
||||
"ldrd r6, r7, [%[key], #24]\n\t"
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
|
||||
/* REV r4, r4 */
|
||||
"eor r3, r4, r4, ror #16\n\t"
|
||||
@@ -9107,23 +9092,12 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
|
||||
"b L_AES_set_encrypt_key_end_%=\n\t"
|
||||
"\n"
|
||||
"L_AES_set_encrypt_key_start_192_%=: \n\t"
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
"ldm r0, {r4, r5}\n\t"
|
||||
#else
|
||||
"ldrd r4, r5, [%[key]]\n\t"
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
"ldr r4, [%[key]]\n\t"
|
||||
"ldr r5, [%[key], #4]\n\t"
|
||||
"ldr r6, [%[key], #8]\n\t"
|
||||
"ldr r7, [%[key], #12]\n\t"
|
||||
#else
|
||||
"ldrd r6, r7, [%[key], #8]\n\t"
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
"ldr %[len], [%[key], #20]\n\t"
|
||||
"ldr %[key], [%[key], #16]\n\t"
|
||||
#else
|
||||
"ldrd %[key], %[len], [%[key], #16]\n\t"
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
|
||||
/* REV r4, r4 */
|
||||
"eor r3, r4, r4, ror #16\n\t"
|
||||
@@ -9273,17 +9247,10 @@ WC_OMIT_FRAME_POINTER void AES_set_encrypt_key(const unsigned char* key,
|
||||
"b L_AES_set_encrypt_key_end_%=\n\t"
|
||||
"\n"
|
||||
"L_AES_set_encrypt_key_start_128_%=: \n\t"
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
"ldm r0, {r4, r5}\n\t"
|
||||
#else
|
||||
"ldrd r4, r5, [%[key]]\n\t"
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 7)
|
||||
"ldr r4, [%[key]]\n\t"
|
||||
"ldr r5, [%[key], #4]\n\t"
|
||||
"ldr r6, [%[key], #8]\n\t"
|
||||
"ldr r7, [%[key], #12]\n\t"
|
||||
#else
|
||||
"ldrd r6, r7, [%[key], #8]\n\t"
|
||||
#endif
|
||||
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
|
||||
/* REV r4, r4 */
|
||||
"eor r3, r4, r4, ror #16\n\t"
|
||||
|
||||
@@ -46,8 +46,9 @@ _AES_set_key_AARCH64:
|
||||
cmp x1, #24
|
||||
blt L_aes_set_key_arm64_crypto_start_128
|
||||
bgt L_aes_set_key_arm64_crypto_start_256
|
||||
ldp x4, x6, [x0], #16
|
||||
ldr x8, [x0]
|
||||
ldr x4, [x0], #8
|
||||
ldr x6, [x0], #8
|
||||
ldr x8, [x0], #8
|
||||
stp x4, x6, [x2], #16
|
||||
str x8, [x2], #8
|
||||
lsr x5, x4, #32
|
||||
@@ -212,8 +213,10 @@ _AES_set_key_AARCH64:
|
||||
stur q0, [x2, #96]
|
||||
b L_aes_set_key_arm64_crypto_done
|
||||
L_aes_set_key_arm64_crypto_start_256:
|
||||
ldp x4, x6, [x0], #16
|
||||
ldp x8, x10, [x0], #16
|
||||
ldr x4, [x0], #8
|
||||
ldr x6, [x0], #8
|
||||
ldr x8, [x0], #8
|
||||
ldr x10, [x0], #8
|
||||
stp x4, x6, [x2], #16
|
||||
stp x8, x10, [x2], #16
|
||||
lsr x5, x4, #32
|
||||
@@ -412,7 +415,8 @@ L_aes_set_key_arm64_crypto_start_256:
|
||||
stur q0, [x2, #112]
|
||||
b L_aes_set_key_arm64_crypto_done
|
||||
L_aes_set_key_arm64_crypto_start_128:
|
||||
ldp x4, x6, [x0], #16
|
||||
ldr x4, [x0], #8
|
||||
ldr x6, [x0], #8
|
||||
stp x4, x6, [x2], #16
|
||||
lsr x5, x4, #32
|
||||
lsr x7, x6, #32
|
||||
|
||||
@@ -40,8 +40,9 @@ void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir)
|
||||
"cmp %x[keylen], #24\n\t"
|
||||
"b.lt L_aes_set_key_arm64_crypto_start_128_%=\n\t"
|
||||
"b.gt L_aes_set_key_arm64_crypto_start_256_%=\n\t"
|
||||
"ldp x4, x6, [%x[userKey]], #16\n\t"
|
||||
"ldr x8, [%x[userKey]]\n\t"
|
||||
"ldr x4, [%x[userKey]], #8\n\t"
|
||||
"ldr x6, [%x[userKey]], #8\n\t"
|
||||
"ldr x8, [%x[userKey]], #8\n\t"
|
||||
"stp x4, x6, [%x[key]], #16\n\t"
|
||||
"str x8, [%x[key]], #8\n\t"
|
||||
"lsr x5, x4, #32\n\t"
|
||||
@@ -207,8 +208,10 @@ void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir)
|
||||
"b L_aes_set_key_arm64_crypto_done_%=\n\t"
|
||||
"\n"
|
||||
"L_aes_set_key_arm64_crypto_start_256_%=: \n\t"
|
||||
"ldp x4, x6, [%x[userKey]], #16\n\t"
|
||||
"ldp x8, x10, [%x[userKey]], #16\n\t"
|
||||
"ldr x4, [%x[userKey]], #8\n\t"
|
||||
"ldr x6, [%x[userKey]], #8\n\t"
|
||||
"ldr x8, [%x[userKey]], #8\n\t"
|
||||
"ldr x10, [%x[userKey]], #8\n\t"
|
||||
"stp x4, x6, [%x[key]], #16\n\t"
|
||||
"stp x8, x10, [%x[key]], #16\n\t"
|
||||
"lsr x5, x4, #32\n\t"
|
||||
@@ -408,7 +411,8 @@ void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir)
|
||||
"b L_aes_set_key_arm64_crypto_done_%=\n\t"
|
||||
"\n"
|
||||
"L_aes_set_key_arm64_crypto_start_128_%=: \n\t"
|
||||
"ldp x4, x6, [%x[userKey]], #16\n\t"
|
||||
"ldr x4, [%x[userKey]], #8\n\t"
|
||||
"ldr x6, [%x[userKey]], #8\n\t"
|
||||
"stp x4, x6, [%x[key]], #16\n\t"
|
||||
"lsr x5, x4, #32\n\t"
|
||||
"lsr x7, x6, #32\n\t"
|
||||
|
||||
@@ -10074,6 +10074,13 @@ _mlkem_shake128_blocksx3_seed_neon:
|
||||
stp d10, d11, [x29, #176]
|
||||
stp d12, d13, [x29, #192]
|
||||
stp d14, d15, [x29, #208]
|
||||
#ifndef __APPLE__
|
||||
adrp x28, L_sha3_aarch64_r
|
||||
add x28, x28, :lo12:L_sha3_aarch64_r
|
||||
#else
|
||||
adrp x28, L_sha3_aarch64_r@PAGE
|
||||
add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF
|
||||
#endif /* __APPLE__ */
|
||||
str x0, [x29, #40]
|
||||
add x0, x0, #32
|
||||
ld1 {v4.d}[0], [x0]
|
||||
@@ -10414,6 +10421,13 @@ _mlkem_shake256_blocksx3_seed_neon:
|
||||
stp d10, d11, [x29, #176]
|
||||
stp d12, d13, [x29, #192]
|
||||
stp d14, d15, [x29, #208]
|
||||
#ifndef __APPLE__
|
||||
adrp x28, L_sha3_aarch64_r
|
||||
add x28, x28, :lo12:L_sha3_aarch64_r
|
||||
#else
|
||||
adrp x28, L_sha3_aarch64_r@PAGE
|
||||
add x28, x28, :lo12:L_sha3_aarch64_r@PAGEOFF
|
||||
#endif /* __APPLE__ */
|
||||
str x0, [x29, #40]
|
||||
add x0, x0, #32
|
||||
ld1 {v4.d}[0], [x0]
|
||||
|
||||
@@ -9417,6 +9417,7 @@ void mlkem_sha3_blocksx3_neon(word64* state)
|
||||
|
||||
void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
{
|
||||
const word64* r = L_sha3_aarch64_r;
|
||||
__asm__ __volatile__ (
|
||||
"stp x29, x30, [sp, #-64]!\n\t"
|
||||
"add x29, sp, #0\n\t"
|
||||
@@ -9476,57 +9477,57 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
/* Start of 24 rounds */
|
||||
"\n"
|
||||
"L_SHA3_shake128_blocksx3_seed_neon_begin_%=: \n\t"
|
||||
"stp x28, %x[seed], [x29, #48]\n\t"
|
||||
"stp %[r], %x[seed], [x29, #48]\n\t"
|
||||
/* Col Mix */
|
||||
"eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t"
|
||||
"eor %x[state], x6, x11\n\t"
|
||||
"eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t"
|
||||
"eor x30, x2, x7\n\t"
|
||||
"eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t"
|
||||
"eor x28, x4, x9\n\t"
|
||||
"eor %[r], x4, x9\n\t"
|
||||
"eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t"
|
||||
"eor %x[state], %x[state], x16\n\t"
|
||||
"eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t"
|
||||
"eor x30, x30, x12\n\t"
|
||||
"eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t"
|
||||
"eor x28, x28, x14\n\t"
|
||||
"eor %[r], %[r], x14\n\t"
|
||||
"eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t"
|
||||
"eor %x[state], %x[state], x22\n\t"
|
||||
"eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t"
|
||||
"eor x30, x30, x17\n\t"
|
||||
"eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t"
|
||||
"eor x28, x28, x20\n\t"
|
||||
"eor %[r], %[r], x20\n\t"
|
||||
"eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t"
|
||||
"eor %x[state], %x[state], x27\n\t"
|
||||
"rax1 v25.2d, v30.2d, v27.2d\n\t"
|
||||
"eor x30, x30, x23\n\t"
|
||||
"rax1 v26.2d, v31.2d, v28.2d\n\t"
|
||||
"eor x28, x28, x25\n\t"
|
||||
"eor %[r], %[r], x25\n\t"
|
||||
"rax1 v27.2d, v27.2d, v29.2d\n\t"
|
||||
"str %x[state], [x29, #32]\n\t"
|
||||
"rax1 v28.2d, v28.2d, v30.2d\n\t"
|
||||
"str x28, [x29, #24]\n\t"
|
||||
"str %[r], [x29, #24]\n\t"
|
||||
"rax1 v29.2d, v29.2d, v31.2d\n\t"
|
||||
"eor %x[seed], x3, x8\n\t"
|
||||
"eor v0.16b, v0.16b, v25.16b\n\t"
|
||||
"xar v30.2d, v1.2d, v26.2d, #63\n\t"
|
||||
"eor x28, x5, x10\n\t"
|
||||
"eor %[r], x5, x10\n\t"
|
||||
"xar v1.2d, v6.2d, v26.2d, #20\n\t"
|
||||
"eor %x[seed], %x[seed], x13\n\t"
|
||||
"xar v6.2d, v9.2d, v29.2d, #44\n\t"
|
||||
"eor x28, x28, x15\n\t"
|
||||
"eor %[r], %[r], x15\n\t"
|
||||
"xar v9.2d, v22.2d, v27.2d, #3\n\t"
|
||||
"eor %x[seed], %x[seed], x19\n\t"
|
||||
"xar v22.2d, v14.2d, v29.2d, #25\n\t"
|
||||
"eor x28, x28, x21\n\t"
|
||||
"eor %[r], %[r], x21\n\t"
|
||||
"xar v14.2d, v20.2d, v25.2d, #46\n\t"
|
||||
"eor %x[seed], %x[seed], x24\n\t"
|
||||
"xar v20.2d, v2.2d, v27.2d, #2\n\t"
|
||||
"eor x28, x28, x26\n\t"
|
||||
"eor %[r], %[r], x26\n\t"
|
||||
"xar v2.2d, v12.2d, v27.2d, #21\n\t"
|
||||
"eor %x[state], %x[state], %x[seed], ror 63\n\t"
|
||||
"xar v12.2d, v13.2d, v28.2d, #39\n\t"
|
||||
"eor %x[seed], %x[seed], x28, ror 63\n\t"
|
||||
"eor %x[seed], %x[seed], %[r], ror 63\n\t"
|
||||
"xar v13.2d, v19.2d, v29.2d, #56\n\t"
|
||||
"eor x2, x2, %x[state]\n\t"
|
||||
"xar v19.2d, v23.2d, v28.2d, #8\n\t"
|
||||
@@ -9552,22 +9553,22 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
"xar v18.2d, v17.2d, v27.2d, #49\n\t"
|
||||
"ldr %x[seed], [x29, #24]\n\t"
|
||||
"xar v17.2d, v11.2d, v26.2d, #54\n\t"
|
||||
"eor x28, x28, x30, ror 63\n\t"
|
||||
"eor %[r], %[r], x30, ror 63\n\t"
|
||||
"xar v11.2d, v7.2d, v27.2d, #58\n\t"
|
||||
"eor x30, x30, %x[seed], ror 63\n\t"
|
||||
"xar v7.2d, v10.2d, v25.2d, #61\n\t"
|
||||
"eor %x[seed], %x[seed], %x[state], ror 63\n\t"
|
||||
/* Row Mix */
|
||||
"mov v25.16b, v0.16b\n\t"
|
||||
"eor x6, x6, x28\n\t"
|
||||
"eor x6, x6, %[r]\n\t"
|
||||
"mov v26.16b, v1.16b\n\t"
|
||||
"eor x11, x11, x28\n\t"
|
||||
"eor x11, x11, %[r]\n\t"
|
||||
"bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t"
|
||||
"eor x16, x16, x28\n\t"
|
||||
"eor x16, x16, %[r]\n\t"
|
||||
"bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t"
|
||||
"eor x22, x22, x28\n\t"
|
||||
"eor x22, x22, %[r]\n\t"
|
||||
"bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t"
|
||||
"eor x27, x27, x28\n\t"
|
||||
"eor x27, x27, %[r]\n\t"
|
||||
"bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t"
|
||||
"eor x3, x3, x30\n\t"
|
||||
"bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t"
|
||||
@@ -9635,57 +9636,57 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
/* Row Mix Base */
|
||||
"bic x12, x4, x3\n\t"
|
||||
"bic %x[seed], x5, x4\n\t"
|
||||
"bic x28, x2, x6\n\t"
|
||||
"bic %[r], x2, x6\n\t"
|
||||
"bic x30, x3, x2\n\t"
|
||||
"eor x2, x2, x12\n\t"
|
||||
"eor x3, x3, %x[seed]\n\t"
|
||||
"bic x12, x6, x5\n\t"
|
||||
"eor x5, x5, x28\n\t"
|
||||
"eor x5, x5, %[r]\n\t"
|
||||
"eor x4, x4, x12\n\t"
|
||||
"eor x6, x6, x30\n\t"
|
||||
"bic x12, x9, x8\n\t"
|
||||
"bic %x[seed], x10, x9\n\t"
|
||||
"bic x28, x7, x11\n\t"
|
||||
"bic %[r], x7, x11\n\t"
|
||||
"bic x30, x8, x7\n\t"
|
||||
"eor x7, x7, x12\n\t"
|
||||
"eor x8, x8, %x[seed]\n\t"
|
||||
"bic x12, x11, x10\n\t"
|
||||
"eor x10, x10, x28\n\t"
|
||||
"eor x10, x10, %[r]\n\t"
|
||||
"eor x9, x9, x12\n\t"
|
||||
"eor x11, x11, x30\n\t"
|
||||
"bic x12, x14, x13\n\t"
|
||||
"bic %x[seed], x15, x14\n\t"
|
||||
"bic x28, %x[state], x16\n\t"
|
||||
"bic %[r], %x[state], x16\n\t"
|
||||
"bic x30, x13, %x[state]\n\t"
|
||||
"eor x12, %x[state], x12\n\t"
|
||||
"eor x13, x13, %x[seed]\n\t"
|
||||
"bic %x[state], x16, x15\n\t"
|
||||
"eor x15, x15, x28\n\t"
|
||||
"eor x15, x15, %[r]\n\t"
|
||||
"eor x14, x14, %x[state]\n\t"
|
||||
"eor x16, x16, x30\n\t"
|
||||
"bic %x[state], x20, x19\n\t"
|
||||
"bic %x[seed], x21, x20\n\t"
|
||||
"bic x28, x17, x22\n\t"
|
||||
"bic %[r], x17, x22\n\t"
|
||||
"bic x30, x19, x17\n\t"
|
||||
"eor x17, x17, %x[state]\n\t"
|
||||
"eor x19, x19, %x[seed]\n\t"
|
||||
"bic %x[state], x22, x21\n\t"
|
||||
"eor x21, x21, x28\n\t"
|
||||
"eor x21, x21, %[r]\n\t"
|
||||
"eor x20, x20, %x[state]\n\t"
|
||||
"eor x22, x22, x30\n\t"
|
||||
"bic %x[state], x25, x24\n\t"
|
||||
"bic %x[seed], x26, x25\n\t"
|
||||
"bic x28, x23, x27\n\t"
|
||||
"bic %[r], x23, x27\n\t"
|
||||
"bic x30, x24, x23\n\t"
|
||||
"eor x23, x23, %x[state]\n\t"
|
||||
"eor x24, x24, %x[seed]\n\t"
|
||||
"bic %x[state], x27, x26\n\t"
|
||||
"eor x26, x26, x28\n\t"
|
||||
"eor x26, x26, %[r]\n\t"
|
||||
"eor x25, x25, %x[state]\n\t"
|
||||
"eor x27, x27, x30\n\t"
|
||||
/* Done transforming */
|
||||
"ldp x28, %x[seed], [x29, #48]\n\t"
|
||||
"ldr %x[state], [x28], #8\n\t"
|
||||
"ldp %[r], %x[seed], [x29, #48]\n\t"
|
||||
"ldr %x[state], [%[r]], #8\n\t"
|
||||
"subs %x[seed], %x[seed], #1\n\t"
|
||||
"mov v30.d[0], %x[state]\n\t"
|
||||
"mov v30.d[1], %x[state]\n\t"
|
||||
@@ -9724,11 +9725,11 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
"str x27, [%x[state], #192]\n\t"
|
||||
"ldp x29, x30, [sp], #0x40\n\t"
|
||||
: [state] "+r" (state), [seed] "+r" (seed)
|
||||
:
|
||||
: [r] "r" (r)
|
||||
: "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
|
||||
"x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20",
|
||||
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1",
|
||||
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
|
||||
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2",
|
||||
"v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
|
||||
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||
"v31"
|
||||
@@ -9737,6 +9738,7 @@ void mlkem_shake128_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
|
||||
void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
{
|
||||
const word64* r = L_sha3_aarch64_r;
|
||||
__asm__ __volatile__ (
|
||||
"stp x29, x30, [sp, #-64]!\n\t"
|
||||
"add x29, sp, #0\n\t"
|
||||
@@ -9796,57 +9798,57 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
/* Start of 24 rounds */
|
||||
"\n"
|
||||
"L_SHA3_shake256_blocksx3_seed_neon_begin_%=: \n\t"
|
||||
"stp x28, %x[seed], [x29, #48]\n\t"
|
||||
"stp %[r], %x[seed], [x29, #48]\n\t"
|
||||
/* Col Mix */
|
||||
"eor3 v31.16b, v0.16b, v5.16b, v10.16b\n\t"
|
||||
"eor %x[state], x6, x11\n\t"
|
||||
"eor3 v27.16b, v1.16b, v6.16b, v11.16b\n\t"
|
||||
"eor x30, x2, x7\n\t"
|
||||
"eor3 v28.16b, v2.16b, v7.16b, v12.16b\n\t"
|
||||
"eor x28, x4, x9\n\t"
|
||||
"eor %[r], x4, x9\n\t"
|
||||
"eor3 v29.16b, v3.16b, v8.16b, v13.16b\n\t"
|
||||
"eor %x[state], %x[state], x16\n\t"
|
||||
"eor3 v30.16b, v4.16b, v9.16b, v14.16b\n\t"
|
||||
"eor x30, x30, x12\n\t"
|
||||
"eor3 v31.16b, v31.16b, v15.16b, v20.16b\n\t"
|
||||
"eor x28, x28, x14\n\t"
|
||||
"eor %[r], %[r], x14\n\t"
|
||||
"eor3 v27.16b, v27.16b, v16.16b, v21.16b\n\t"
|
||||
"eor %x[state], %x[state], x22\n\t"
|
||||
"eor3 v28.16b, v28.16b, v17.16b, v22.16b\n\t"
|
||||
"eor x30, x30, x17\n\t"
|
||||
"eor3 v29.16b, v29.16b, v18.16b, v23.16b\n\t"
|
||||
"eor x28, x28, x20\n\t"
|
||||
"eor %[r], %[r], x20\n\t"
|
||||
"eor3 v30.16b, v30.16b, v19.16b, v24.16b\n\t"
|
||||
"eor %x[state], %x[state], x27\n\t"
|
||||
"rax1 v25.2d, v30.2d, v27.2d\n\t"
|
||||
"eor x30, x30, x23\n\t"
|
||||
"rax1 v26.2d, v31.2d, v28.2d\n\t"
|
||||
"eor x28, x28, x25\n\t"
|
||||
"eor %[r], %[r], x25\n\t"
|
||||
"rax1 v27.2d, v27.2d, v29.2d\n\t"
|
||||
"str %x[state], [x29, #32]\n\t"
|
||||
"rax1 v28.2d, v28.2d, v30.2d\n\t"
|
||||
"str x28, [x29, #24]\n\t"
|
||||
"str %[r], [x29, #24]\n\t"
|
||||
"rax1 v29.2d, v29.2d, v31.2d\n\t"
|
||||
"eor %x[seed], x3, x8\n\t"
|
||||
"eor v0.16b, v0.16b, v25.16b\n\t"
|
||||
"xar v30.2d, v1.2d, v26.2d, #63\n\t"
|
||||
"eor x28, x5, x10\n\t"
|
||||
"eor %[r], x5, x10\n\t"
|
||||
"xar v1.2d, v6.2d, v26.2d, #20\n\t"
|
||||
"eor %x[seed], %x[seed], x13\n\t"
|
||||
"xar v6.2d, v9.2d, v29.2d, #44\n\t"
|
||||
"eor x28, x28, x15\n\t"
|
||||
"eor %[r], %[r], x15\n\t"
|
||||
"xar v9.2d, v22.2d, v27.2d, #3\n\t"
|
||||
"eor %x[seed], %x[seed], x19\n\t"
|
||||
"xar v22.2d, v14.2d, v29.2d, #25\n\t"
|
||||
"eor x28, x28, x21\n\t"
|
||||
"eor %[r], %[r], x21\n\t"
|
||||
"xar v14.2d, v20.2d, v25.2d, #46\n\t"
|
||||
"eor %x[seed], %x[seed], x24\n\t"
|
||||
"xar v20.2d, v2.2d, v27.2d, #2\n\t"
|
||||
"eor x28, x28, x26\n\t"
|
||||
"eor %[r], %[r], x26\n\t"
|
||||
"xar v2.2d, v12.2d, v27.2d, #21\n\t"
|
||||
"eor %x[state], %x[state], %x[seed], ror 63\n\t"
|
||||
"xar v12.2d, v13.2d, v28.2d, #39\n\t"
|
||||
"eor %x[seed], %x[seed], x28, ror 63\n\t"
|
||||
"eor %x[seed], %x[seed], %[r], ror 63\n\t"
|
||||
"xar v13.2d, v19.2d, v29.2d, #56\n\t"
|
||||
"eor x2, x2, %x[state]\n\t"
|
||||
"xar v19.2d, v23.2d, v28.2d, #8\n\t"
|
||||
@@ -9872,22 +9874,22 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
"xar v18.2d, v17.2d, v27.2d, #49\n\t"
|
||||
"ldr %x[seed], [x29, #24]\n\t"
|
||||
"xar v17.2d, v11.2d, v26.2d, #54\n\t"
|
||||
"eor x28, x28, x30, ror 63\n\t"
|
||||
"eor %[r], %[r], x30, ror 63\n\t"
|
||||
"xar v11.2d, v7.2d, v27.2d, #58\n\t"
|
||||
"eor x30, x30, %x[seed], ror 63\n\t"
|
||||
"xar v7.2d, v10.2d, v25.2d, #61\n\t"
|
||||
"eor %x[seed], %x[seed], %x[state], ror 63\n\t"
|
||||
/* Row Mix */
|
||||
"mov v25.16b, v0.16b\n\t"
|
||||
"eor x6, x6, x28\n\t"
|
||||
"eor x6, x6, %[r]\n\t"
|
||||
"mov v26.16b, v1.16b\n\t"
|
||||
"eor x11, x11, x28\n\t"
|
||||
"eor x11, x11, %[r]\n\t"
|
||||
"bcax v0.16b, v25.16b, v2.16b, v26.16b\n\t"
|
||||
"eor x16, x16, x28\n\t"
|
||||
"eor x16, x16, %[r]\n\t"
|
||||
"bcax v1.16b, v26.16b, v3.16b, v2.16b\n\t"
|
||||
"eor x22, x22, x28\n\t"
|
||||
"eor x22, x22, %[r]\n\t"
|
||||
"bcax v2.16b, v2.16b, v4.16b, v3.16b\n\t"
|
||||
"eor x27, x27, x28\n\t"
|
||||
"eor x27, x27, %[r]\n\t"
|
||||
"bcax v3.16b, v3.16b, v25.16b, v4.16b\n\t"
|
||||
"eor x3, x3, x30\n\t"
|
||||
"bcax v4.16b, v4.16b, v26.16b, v25.16b\n\t"
|
||||
@@ -9955,57 +9957,57 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
/* Row Mix Base */
|
||||
"bic x12, x4, x3\n\t"
|
||||
"bic %x[seed], x5, x4\n\t"
|
||||
"bic x28, x2, x6\n\t"
|
||||
"bic %[r], x2, x6\n\t"
|
||||
"bic x30, x3, x2\n\t"
|
||||
"eor x2, x2, x12\n\t"
|
||||
"eor x3, x3, %x[seed]\n\t"
|
||||
"bic x12, x6, x5\n\t"
|
||||
"eor x5, x5, x28\n\t"
|
||||
"eor x5, x5, %[r]\n\t"
|
||||
"eor x4, x4, x12\n\t"
|
||||
"eor x6, x6, x30\n\t"
|
||||
"bic x12, x9, x8\n\t"
|
||||
"bic %x[seed], x10, x9\n\t"
|
||||
"bic x28, x7, x11\n\t"
|
||||
"bic %[r], x7, x11\n\t"
|
||||
"bic x30, x8, x7\n\t"
|
||||
"eor x7, x7, x12\n\t"
|
||||
"eor x8, x8, %x[seed]\n\t"
|
||||
"bic x12, x11, x10\n\t"
|
||||
"eor x10, x10, x28\n\t"
|
||||
"eor x10, x10, %[r]\n\t"
|
||||
"eor x9, x9, x12\n\t"
|
||||
"eor x11, x11, x30\n\t"
|
||||
"bic x12, x14, x13\n\t"
|
||||
"bic %x[seed], x15, x14\n\t"
|
||||
"bic x28, %x[state], x16\n\t"
|
||||
"bic %[r], %x[state], x16\n\t"
|
||||
"bic x30, x13, %x[state]\n\t"
|
||||
"eor x12, %x[state], x12\n\t"
|
||||
"eor x13, x13, %x[seed]\n\t"
|
||||
"bic %x[state], x16, x15\n\t"
|
||||
"eor x15, x15, x28\n\t"
|
||||
"eor x15, x15, %[r]\n\t"
|
||||
"eor x14, x14, %x[state]\n\t"
|
||||
"eor x16, x16, x30\n\t"
|
||||
"bic %x[state], x20, x19\n\t"
|
||||
"bic %x[seed], x21, x20\n\t"
|
||||
"bic x28, x17, x22\n\t"
|
||||
"bic %[r], x17, x22\n\t"
|
||||
"bic x30, x19, x17\n\t"
|
||||
"eor x17, x17, %x[state]\n\t"
|
||||
"eor x19, x19, %x[seed]\n\t"
|
||||
"bic %x[state], x22, x21\n\t"
|
||||
"eor x21, x21, x28\n\t"
|
||||
"eor x21, x21, %[r]\n\t"
|
||||
"eor x20, x20, %x[state]\n\t"
|
||||
"eor x22, x22, x30\n\t"
|
||||
"bic %x[state], x25, x24\n\t"
|
||||
"bic %x[seed], x26, x25\n\t"
|
||||
"bic x28, x23, x27\n\t"
|
||||
"bic %[r], x23, x27\n\t"
|
||||
"bic x30, x24, x23\n\t"
|
||||
"eor x23, x23, %x[state]\n\t"
|
||||
"eor x24, x24, %x[seed]\n\t"
|
||||
"bic %x[state], x27, x26\n\t"
|
||||
"eor x26, x26, x28\n\t"
|
||||
"eor x26, x26, %[r]\n\t"
|
||||
"eor x25, x25, %x[state]\n\t"
|
||||
"eor x27, x27, x30\n\t"
|
||||
/* Done transforming */
|
||||
"ldp x28, %x[seed], [x29, #48]\n\t"
|
||||
"ldr %x[state], [x28], #8\n\t"
|
||||
"ldp %[r], %x[seed], [x29, #48]\n\t"
|
||||
"ldr %x[state], [%[r]], #8\n\t"
|
||||
"subs %x[seed], %x[seed], #1\n\t"
|
||||
"mov v30.d[0], %x[state]\n\t"
|
||||
"mov v30.d[1], %x[state]\n\t"
|
||||
@@ -10044,11 +10046,11 @@ void mlkem_shake256_blocksx3_seed_neon(word64* state, byte* seed)
|
||||
"str x27, [%x[state], #192]\n\t"
|
||||
"ldp x29, x30, [sp], #0x40\n\t"
|
||||
: [state] "+r" (state), [seed] "+r" (seed)
|
||||
:
|
||||
: [r] "r" (r)
|
||||
: "memory", "cc", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
|
||||
"x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20",
|
||||
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "v0", "v1",
|
||||
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
|
||||
"x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2",
|
||||
"v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
|
||||
"v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||
"v31"
|
||||
|
||||
Reference in New Issue
Block a user