From 4585c6d508b872a08778a380a691a1429997cdf3 Mon Sep 17 00:00:00 2001
From: Sean Parkinson
Date: Mon, 29 Jan 2024 11:03:40 +1000
Subject: [PATCH] ARMv8 32 bit ChaCha20 ASM: loading from in/out

Input and output buffers come from the application and are not
necessarily aligned. Use instructions that allow unaligned access to
these buffers.
---
 wolfcrypt/src/port/arm/armv8-chacha.c | 34 ++++++++++++++-------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c
index 94e645049..18dd9e596 100644
--- a/wolfcrypt/src/port/arm/armv8-chacha.c
+++ b/wolfcrypt/src/port/arm/armv8-chacha.c
@@ -1666,7 +1666,10 @@ static WC_INLINE int wc_Chacha_encrypt_128(const word32 input[CHACHA_CHUNK_WORDS
         "VADD.I32 q6, q6, q12 \n\t"
         "VADD.I32 q7, q7, q13 \n\t"
-        "VLDM %[m], { q8-q15 } \n\t"
+        "VLD1.8 { q8, q9 }, [%[m]]! \n\t"
+        "VLD1.8 { q10, q11 }, [%[m]]! \n\t"
+        "VLD1.8 { q12, q13 }, [%[m]]! \n\t"
+        "VLD1.8 { q14, q15 }, [%[m]]! \n\t"
         "VEOR q0, q0, q8 \n\t"
         "VEOR q1, q1, q9 \n\t"
         "VEOR q2, q2, q10 \n\t"
@@ -1675,7 +1678,10 @@ static WC_INLINE int wc_Chacha_encrypt_128(const word32 input[CHACHA_CHUNK_WORDS
         "VEOR q5, q5, q13 \n\t"
         "VEOR q6, q6, q14 \n\t"
         "VEOR q7, q7, q15 \n\t"
-        "VSTM %[c], { q0-q7 } \n\t"
+        "VST1.8 { q0, q1 }, [%[c]]! \n\t"
+        "VST1.8 { q2, q3 }, [%[c]]! \n\t"
+        "VST1.8 { q4, q5 }, [%[c]]! \n\t"
+        "VST1.8 { q6, q7 }, [%[c]]! \n\t"
         : [c] "+r" (c), [m] "+r" (m)
         : [rounds] "I" (ROUNDS/2), [input] "r" (input),
@@ -2725,14 +2731,14 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
         "CMP %[bytes], #64 \n\t"
         "BLT L_chacha20_arm32_64_lt_64_%= \n\t"
         /* XOR full 64 byte block */
-        "VLDM %[m], { q4-q7 } \n\t"
-        "ADD %[m], %[m], #64 \n\t"
+        "VLD1.8 { q4, q5 }, [%[m]]! \n\t"
+        "VLD1.8 { q6, q7 }, [%[m]]! \n\t"
         "VEOR q0, q0, q4 \n\t"
         "VEOR q1, q1, q5 \n\t"
         "VEOR q2, q2, q6 \n\t"
         "VEOR q3, q3, q7 \n\t"
-        "VSTM %[c], { q0-q3 } \n\t"
-        "ADD %[c], %[c], #64 \n\t"
+        "VST1.8 { q0, q1 }, [%[c]]! \n\t"
+        "VST1.8 { q2, q3 }, [%[c]]! \n\t"
         "SUBS %[bytes], %[bytes], #64 \n\t"
         "VADD.I32 q11, q11, q14 \n\t"
         "BNE L_chacha20_arm32_64_outer_loop_%= \n\t"
@@ -2743,12 +2749,10 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
         /* XOR 32 bytes */
         "CMP %[bytes], #32 \n\t"
         "BLT L_chacha20_arm32_64_lt_32_%= \n\t"
-        "VLDM %[m], { q4-q5 } \n\t"
-        "ADD %[m], %[m], #32 \n\t"
+        "VLD1.8 { q4, q5 }, [%[m]]! \n\t"
         "VEOR q4, q4, q0 \n\t"
         "VEOR q5, q5, q1 \n\t"
-        "VSTM %[c], { q4-q5 } \n\t"
-        "ADD %[c], %[c], #32 \n\t"
+        "VST1.8 { q4, q5 }, [%[c]]! \n\t"
         "SUBS %[bytes], %[bytes], #32 \n\t"
         "VMOV q0, q2 \n\t"
         "VMOV q1, q3 \n\t"
@@ -2758,11 +2762,9 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
         /* XOR 16 bytes */
         "CMP %[bytes], #16 \n\t"
         "BLT L_chacha20_arm32_64_lt_16_%= \n\t"
-        "VLDM %[m], { q4 } \n\t"
-        "ADD %[m], %[m], #16 \n\t"
+        "VLD1.8 { q4 }, [%[m]]! \n\t"
         "VEOR q4, q4, q0 \n\t"
-        "VSTM %[c], { q4 } \n\t"
-        "ADD %[c], %[c], #16 \n\t"
+        "VST1.8 { q4 }, [%[c]]! \n\t"
         "SUBS %[bytes], %[bytes], #16 \n\t"
         "VMOV q0, q1 \n\t"
         "BEQ L_chacha20_arm32_64_done_%= \n\t"
@@ -2771,9 +2773,9 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
         /* XOR 8 bytes */
         "CMP %[bytes], #8 \n\t"
         "BLT L_chacha20_arm32_64_lt_8_%= \n\t"
-        "VLD1.64 { d8 }, [%[m]]! \n\t"
+        "VLD1.8 { d8 }, [%[m]]! \n\t"
         "VEOR d8, d8, d0 \n\t"
-        "VST1.64 { d8 }, [%[c]]! \n\t"
+        "VST1.8 { d8 }, [%[c]]! \n\t"
         "SUBS %[bytes], %[bytes], #8 \n\t"
         "VMOV d0, d1 \n\t"
         "BEQ L_chacha20_arm32_64_done_%= \n\t"