ChaCha ARM 32-bit: get debug working

r7 is needed for debug: in debug builds the compiler reserves r7 as the Thumb frame pointer, so the inline assembly can no longer list it as a clobber. Use stack space addressed explicitly through sp instead of a compiler-allocated scratch array, and save/restore r7 around the assembly in debug builds (see the sketch below).
Author: Sean Parkinson
Date:   2022-09-16 11:29:30 +10:00
Parent: 2c4c7ba6da
Commit: 7062ed0a2c
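
For illustration, here is a minimal sketch of the pattern this commit adopts, assuming GCC-style inline assembly on 32-bit ARM. The function name, body, and offsets are hypothetical, not the wolfSSL routine: in debug builds r7 (the Thumb frame pointer) is saved and restored manually and omitted from the clobber list, and scratch words are addressed through explicit sp offsets rather than compiler-placed memory operands.

/* Hypothetical sketch: keep r7 usable by the debugger while the asm
 * body borrows it. Scratch space is carved out of the stack with sp
 * arithmetic instead of a compiler-placed local array. */
static unsigned int scratch_roundtrip(unsigned int v)
{
    __asm__ __volatile__ (
#ifndef NDEBUG
        "PUSH { r7 }           \n\t" // save the frame pointer
#endif
        "SUB sp, sp, #16*4     \n\t" // reserve 16 words of scratch
        "MOV r7, %[v]          \n\t" // the asm body may now use r7 freely
        "STR r7, [sp, #4*10]   \n\t" // spill through an explicit sp offset...
        "LDR %[v], [sp, #4*10] \n\t" // ...and reload the same word
        "ADD sp, sp, #16*4     \n\t" // release the scratch space
#ifndef NDEBUG
        "POP { r7 }            \n\t" // restore the frame pointer
#endif
        : [v] "+r" (v)
        :
        : "memory", "cc"
#ifdef NDEBUG
        , "r7"                       // r7 may only be declared clobbered in release builds
#endif
    );
    return v;
}

The same arithmetic explains the 17*4 vs 16*4 adjustments in the diff below: the debug-only PUSH { r7 } moves sp down one extra word, so re-deriving the scratch block's address from the restored sp needs an offset of 17*4 bytes rather than 16*4.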


@@ -968,14 +968,16 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
           "v21", "v22", "v23"
     );
 #else
-    word32 x[CHACHA_CHUNK_WORDS];
-    word32* x_addr = x;
     __asm__ __volatile__ (
         // The paper NEON crypto by Daniel J. Bernstein and Peter Schwabe was used to optimize for ARM
         // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
         ".align 2 \n\t"
         "LDR r14, %[input] \n\t" // load input address
+#ifndef NDEBUG
+        "PUSH { r7 } \n\t"
+#endif
+        "SUB sp, sp, #16*4 \n\t"
         "LDM r14, { r0-r12 } \n\t"
         // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
@@ -985,7 +987,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
         "VMOV d2, r4, r5 \n\t"
         "VMOV d3, r6, r7 \n\t"
         "VMOV d4, r8, r9 \n\t"
-        "STRD r10, r11, %[x_10] \n\t"
+        "STRD r10, r11, [sp, #4*10] \n\t"
         "VMOV d5, r10, r11 \n\t"
 #if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 8)
         "LDR r11, [r14, #4*14] \n\t"
@@ -1002,7 +1004,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
         // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
         // 0 1 2 3 4 5 6 7 8 9 15 14 12
         "VMOV d7, r11, r10 \n\t"
-        "STR r10, %[x_15] \n\t"
+        "STR r10, [sp, #4*15] \n\t"
         "VMOV d15, r11, r10 \n\t"
         "VMOV d23, r11, r10 \n\t"
         "MOV r10, r12 \n\t"
@@ -1070,22 +1072,22 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
         "VSRI.I32 q1, q12, #20 \n\t"
         "ADD r9, r9, r11 \n\t" // 9 9 13
         "VSRI.I32 q5, q13, #20 \n\t"
-        "STR r11, %[x_13] \n\t"
+        "STR r11, [sp, #4*13] \n\t"
         "VSRI.I32 q9, q14, #20 \n\t"
-        "LDR r11, %[x_15] \n\t"
+        "LDR r11, [sp, #4*15] \n\t"
         "VADD.I32 q0, q0, q1 \n\t"
         "EOR r4, r4, r8 \n\t" // 4 4 8
         "VADD.I32 q4, q4, q5 \n\t"
-        "STR r8, %[x_8] \n\t"
+        "STR r8, [sp, #4*8] \n\t"
         "VADD.I32 q8, q8, q9 \n\t"
-        "LDR r8, %[x_10] \n\t"
+        "LDR r8, [sp, #4*10] \n\t"
         "VEOR q12, q3, q0 \n\t"
         "EOR r5, r5, r9 \n\t" // 5 5 9
         "VEOR q13, q7, q4 \n\t"
-        "STR r9, %[x_9] \n\t"
+        "STR r9, [sp, #4*9] \n\t"
         "VEOR q14, q11, q8 \n\t"
-        "LDR r9, %[x_11] \n\t"
+        "LDR r9, [sp, #4*11] \n\t"
         // SIMD instructions don't support rotation so we have to cheat using shifts and a help register
         "VSHL.I32 q3, q12, #8 \n\t"
         "ROR r4, r4, #25 \n\t" // 4 4
@@ -1199,24 +1201,24 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
         "VSHL.I32 q9, q14, #12 \n\t"
         "ADD r8, r8, r11 \n\t" // 10 10 15
         "VSRI.I32 q1, q12, #20 \n\t"
-        "STR r11, %[x_15] \n\t"
+        "STR r11, [sp, #4*15] \n\t"
         "VSRI.I32 q5, q13, #20 \n\t"
-        "LDR r11, %[x_13] \n\t"
+        "LDR r11, [sp, #4*13] \n\t"
         "VSRI.I32 q9, q14, #20 \n\t"
         "ADD r9, r9, r10 \n\t" // 11 11 12
         "VADD.I32 q0, q0, q1 \n\t"
         "EOR r5, r5, r8 \n\t" // 5 5 10
         "VADD.I32 q4, q4, q5 \n\t"
-        "STR r8, %[x_10] \n\t"
+        "STR r8, [sp, #4*10] \n\t"
         "VADD.I32 q8, q8, q9 \n\t"
-        "LDR r8, %[x_8] \n\t"
+        "LDR r8, [sp, #4*8] \n\t"
         "VEOR q12, q3, q0 \n\t"
         "EOR r6, r6, r9 \n\t" // 6 6 11
         "VEOR q13, q7, q4 \n\t"
-        "STR r9, %[x_11] \n\t"
+        "STR r9, [sp, #4*11] \n\t"
         "VEOR q14, q11, q8 \n\t"
-        "LDR r9, %[x_9] \n\t"
+        "LDR r9, [sp, #4*9] \n\t"
         // SIMD instructions don't support rotation so we have to cheat using shifts and a help register
         "VSHL.I32 q3, q12, #8 \n\t"
         "ROR r5, r5, #25 \n\t" // 5 5
@@ -1286,18 +1288,26 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
         "BNE L_chacha20_arm32_256_loop_%= \n\t"
-        "LDR r14, %[x_addr] \n\t" // load address of x to r14
         // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
         // 0 1 2 3 4 5 6 7 8 9 12 13 14
         "ADD r10, r10, #3 \n\t" // add three here to make later NEON easier
-        "STM r14, { r0-r9 } \n\t"
-        "STRD r10, r11, [r14, #4*12] \n\t"
+        "STM sp, { r0-r9 } \n\t"
+        "STRD r10, r11, [sp, #4*12] \n\t"
+        "STR r12, [sp, #4*14] \n\t"
+        "ADD sp, sp, #16*4 \n\t"
+#ifndef NDEBUG
+        "POP { r7 } \n\t"
+#endif
         "LDR r9, %[input] \n\t" // load input address
-        "STR r12, [r14, #4*14] \n\t"
         "LDR r10, %[c] \n\t" // load c address
         "VLDM r9, { q12-q15 } \n\t"
         "LDR r12, %[m] \n\t" // load m address
+#ifndef NDEBUG
+        "SUB sp, sp, #17*4 \n\t"
+#else
+        "SUB sp, sp, #16*4 \n\t"
+#endif
         "VADD.I32 q0, q0, q12 \n\t"
         "VADD.I32 q1, q1, q13 \n\t"
@@ -1329,7 +1339,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
         "VEOR q3, q3, q15 \n\t"
         "VSTM r10!, { q0-q3 } \n\t" // store to c
-        "VLDM r14, { q0-q3 } \n\t" // load final block from x
+        "VLDM sp, { q0-q3 } \n\t" // load final block from x
         "VLDM r12!, { q12-q15 } \n\t" // load m
         "VEOR q4, q4, q12 \n\t"
@@ -1358,20 +1368,21 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
         "VEOR q3, q3, q15 \n\t"
         "VSTM r10!, { q0-q3 } \n\t" // store to c
-        : [c] "+m" (c),
-          [x_0] "=m" (x),
-          [x_8] "=m" (x[8]),
-          [x_9] "=m" (x[9]),
-          [x_10] "=m" (x[10]),
-          [x_11] "=m" (x[11]),
-          [x_13] "=m" (x[13]),
-          [x_15] "=m" (x[15])
+#ifndef NDEBUG
+        "ADD sp, sp, #17*4 \n\t"
+#else
+        "ADD sp, sp, #16*4 \n\t"
+#endif
+        : [c] "+m" (c)
         : [rounds] "I" (ROUNDS/2), [input] "m" (input),
           [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES),
-          [m] "m" (m), [x_addr] "m" (x_addr)
+          [m] "m" (m)
         : "memory", "cc",
           "r0", "r1", "r2", "r3",
-          "r4", "r5", "r6", "r7",
+          "r4", "r5", "r6",
+#ifdef NDEBUG
+          "r7",
+#endif
           "r8", "r9", "r10", "r11", "r12", "r14",
           "q0", "q1", "q2", "q3", "q4",
           "q5", "q6", "q7", "q8", "q9",