ARMv8 32 bit ChaCha20 ASM: loading from in/out

Input and output buffers come from the application and are not
necessarily aligned.
Use instructions that allow unaligned access to these buffers.
This commit is contained in:
Sean Parkinson
2024-01-29 11:03:40 +10:00
parent 3db58af4f8
commit 4585c6d508

View File

@@ -1666,7 +1666,10 @@ static WC_INLINE int wc_Chacha_encrypt_128(const word32 input[CHACHA_CHUNK_WORDS
"VADD.I32 q6, q6, q12 \n\t" "VADD.I32 q6, q6, q12 \n\t"
"VADD.I32 q7, q7, q13 \n\t" "VADD.I32 q7, q7, q13 \n\t"
"VLDM %[m], { q8-q15 } \n\t" "VLD1.8 { q8, q9 }, [%[m]]! \n\t"
"VLD1.8 { q10, q11 }, [%[m]]! \n\t"
"VLD1.8 { q12, q13 }, [%[m]]! \n\t"
"VLD1.8 { q14, q15 }, [%[m]]! \n\t"
"VEOR q0, q0, q8 \n\t" "VEOR q0, q0, q8 \n\t"
"VEOR q1, q1, q9 \n\t" "VEOR q1, q1, q9 \n\t"
"VEOR q2, q2, q10 \n\t" "VEOR q2, q2, q10 \n\t"
@@ -1675,7 +1678,10 @@ static WC_INLINE int wc_Chacha_encrypt_128(const word32 input[CHACHA_CHUNK_WORDS
"VEOR q5, q5, q13 \n\t" "VEOR q5, q5, q13 \n\t"
"VEOR q6, q6, q14 \n\t" "VEOR q6, q6, q14 \n\t"
"VEOR q7, q7, q15 \n\t" "VEOR q7, q7, q15 \n\t"
"VSTM %[c], { q0-q7 } \n\t" "VST1.8 { q0, q1 }, [%[c]]! \n\t"
"VST1.8 { q2, q3 }, [%[c]]! \n\t"
"VST1.8 { q4, q5 }, [%[c]]! \n\t"
"VST1.8 { q6, q7 }, [%[c]]! \n\t"
: [c] "+r" (c), [m] "+r" (m) : [c] "+r" (c), [m] "+r" (m)
: [rounds] "I" (ROUNDS/2), [input] "r" (input), : [rounds] "I" (ROUNDS/2), [input] "r" (input),
@@ -2725,14 +2731,14 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
"CMP %[bytes], #64 \n\t" "CMP %[bytes], #64 \n\t"
"BLT L_chacha20_arm32_64_lt_64_%= \n\t" "BLT L_chacha20_arm32_64_lt_64_%= \n\t"
/* XOR full 64 byte block */ /* XOR full 64 byte block */
"VLDM %[m], { q4-q7 } \n\t" "VLD1.8 { q4, q5 }, [%[m]]! \n\t"
"ADD %[m], %[m], #64 \n\t" "VLD1.8 { q6, q7 }, [%[m]]! \n\t"
"VEOR q0, q0, q4 \n\t" "VEOR q0, q0, q4 \n\t"
"VEOR q1, q1, q5 \n\t" "VEOR q1, q1, q5 \n\t"
"VEOR q2, q2, q6 \n\t" "VEOR q2, q2, q6 \n\t"
"VEOR q3, q3, q7 \n\t" "VEOR q3, q3, q7 \n\t"
"VSTM %[c], { q0-q3 } \n\t" "VST1.8 { q0, q1 }, [%[c]]! \n\t"
"ADD %[c], %[c], #64 \n\t" "VST1.8 { q2, q3 }, [%[c]]! \n\t"
"SUBS %[bytes], %[bytes], #64 \n\t" "SUBS %[bytes], %[bytes], #64 \n\t"
"VADD.I32 q11, q11, q14 \n\t" "VADD.I32 q11, q11, q14 \n\t"
"BNE L_chacha20_arm32_64_outer_loop_%= \n\t" "BNE L_chacha20_arm32_64_outer_loop_%= \n\t"
@@ -2743,12 +2749,10 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
/* XOR 32 bytes */ /* XOR 32 bytes */
"CMP %[bytes], #32 \n\t" "CMP %[bytes], #32 \n\t"
"BLT L_chacha20_arm32_64_lt_32_%= \n\t" "BLT L_chacha20_arm32_64_lt_32_%= \n\t"
"VLDM %[m], { q4-q5 } \n\t" "VLD1.8 { q4, q5 }, [%[m]]! \n\t"
"ADD %[m], %[m], #32 \n\t"
"VEOR q4, q4, q0 \n\t" "VEOR q4, q4, q0 \n\t"
"VEOR q5, q5, q1 \n\t" "VEOR q5, q5, q1 \n\t"
"VSTM %[c], { q4-q5 } \n\t" "VST1.8 { q4, q5 }, [%[c]]! \n\t"
"ADD %[c], %[c], #32 \n\t"
"SUBS %[bytes], %[bytes], #32 \n\t" "SUBS %[bytes], %[bytes], #32 \n\t"
"VMOV q0, q2 \n\t" "VMOV q0, q2 \n\t"
"VMOV q1, q3 \n\t" "VMOV q1, q3 \n\t"
@@ -2758,11 +2762,9 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
/* XOR 16 bytes */ /* XOR 16 bytes */
"CMP %[bytes], #16 \n\t" "CMP %[bytes], #16 \n\t"
"BLT L_chacha20_arm32_64_lt_16_%= \n\t" "BLT L_chacha20_arm32_64_lt_16_%= \n\t"
"VLDM %[m], { q4 } \n\t" "VLD1.8 { q4 }, [%[m]]! \n\t"
"ADD %[m], %[m], #16 \n\t"
"VEOR q4, q4, q0 \n\t" "VEOR q4, q4, q0 \n\t"
"VSTM %[c], { q4 } \n\t" "VST1.8 { q4 }, [%[c]]! \n\t"
"ADD %[c], %[c], #16 \n\t"
"SUBS %[bytes], %[bytes], #16 \n\t" "SUBS %[bytes], %[bytes], #16 \n\t"
"VMOV q0, q1 \n\t" "VMOV q0, q1 \n\t"
"BEQ L_chacha20_arm32_64_done_%= \n\t" "BEQ L_chacha20_arm32_64_done_%= \n\t"
@@ -2771,9 +2773,9 @@ static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
/* XOR 8 bytes */ /* XOR 8 bytes */
"CMP %[bytes], #8 \n\t" "CMP %[bytes], #8 \n\t"
"BLT L_chacha20_arm32_64_lt_8_%= \n\t" "BLT L_chacha20_arm32_64_lt_8_%= \n\t"
"VLD1.64 { d8 }, [%[m]]! \n\t" "VLD1.8 { d8 }, [%[m]]! \n\t"
"VEOR d8, d8, d0 \n\t" "VEOR d8, d8, d0 \n\t"
"VST1.64 { d8 }, [%[c]]! \n\t" "VST1.8 { d8 }, [%[c]]! \n\t"
"SUBS %[bytes], %[bytes], #8 \n\t" "SUBS %[bytes], %[bytes], #8 \n\t"
"VMOV d0, d1 \n\t" "VMOV d0, d1 \n\t"
"BEQ L_chacha20_arm32_64_done_%= \n\t" "BEQ L_chacha20_arm32_64_done_%= \n\t"