From 6564d0336953792994a73fb521ca708790cca8f8 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 24 May 2019 09:43:08 +1000 Subject: [PATCH 01/21] Fix and improvements for X25519 x86_64 ASM code --- wolfcrypt/src/fe_x25519_asm.S | 423 +++++++++++++++++----------------- 1 file changed, 211 insertions(+), 212 deletions(-) diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index b052136aa..c4d2075eb 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -157,16 +157,15 @@ fe_frombytes: _fe_frombytes: #endif /* __APPLE__ */ movq $0x7fffffffffffffff, %r9 - # Copy movq (%rsi), %rdx movq 8(%rsi), %rax movq 16(%rsi), %rcx movq 24(%rsi), %r8 + andq %r9, %r8 movq %rdx, (%rdi) movq %rax, 8(%rdi) movq %rcx, 16(%rdi) movq %r8, 24(%rdi) - andq %r9, 24(%rdi) repz retq #ifndef __APPLE__ .size fe_frombytes,.-fe_frombytes @@ -1264,7 +1263,7 @@ _fe_mul_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -1415,7 +1414,7 @@ _fe_sq_x64: movq $19, %rax adcq %rdx, %r13 mulq %r14 - # Add remaining produce results in + # Add remaining product results in addq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 @@ -1629,7 +1628,7 @@ _fe_sq2_x64: mulq %r14 # Add remaining produce results in addq %r15, %rcx - addq %r11, %r8 + adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 adcq %rax, %r10 @@ -2045,68 +2044,22 @@ L_curve25519_x64_bits: xorq %r10, 48(%rsp) xorq %r11, 56(%rsp) movq %rbp, %rbx - # Sub + # Add movq 64(%rsp), %rcx movq 72(%rsp), %r9 movq 80(%rsp), %r10 - movq 88(%rsp), %r11 - subq 32(%rsp), %rcx - movq $0x00, %rbp - sbbq 40(%rsp), %r9 + movq 88(%rsp), %rbp + movq %rcx, %r12 + addq 32(%rsp), %rcx + movq %r9, %r13 + adcq 40(%rsp), %r9 + movq %r10, %r14 + adcq 48(%rsp), %r10 + movq %rbp, %r15 + adcq 56(%rsp), %rbp movq $-19, %rax - sbbq 48(%rsp), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 56(%rsp), %r11 - sbbq $0x00, %rbp - # 
Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Add modulus (if underflow) - addq %rax, %rcx - adcq %rbp, %r9 - adcq %rbp, %r10 - adcq %rdx, %r11 - movq %rcx, 96(%rsp) - movq %r9, 104(%rsp) - movq %r10, 112(%rsp) - movq %r11, 120(%rsp) - # Sub - movq (%rdi), %rcx - movq 8(%rdi), %r9 - movq 16(%rdi), %r10 - movq 24(%rdi), %r11 - subq (%rsp), %rcx - movq $0x00, %rbp - sbbq 8(%rsp), %r9 - movq $-19, %rax - sbbq 16(%rsp), %r10 - movq $0x7fffffffffffffff, %rdx - sbbq 24(%rsp), %r11 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Add modulus (if underflow) - addq %rax, %rcx - adcq %rbp, %r9 - adcq %rbp, %r10 - adcq %rdx, %r11 - movq %rcx, 128(%rsp) - movq %r9, 136(%rsp) - movq %r10, 144(%rsp) - movq %r11, 152(%rsp) - # Add - movq (%rdi), %rcx - movq 8(%rdi), %r9 - addq (%rsp), %rcx - movq 16(%rdi), %r10 - adcq 8(%rsp), %r9 - movq 24(%rdi), %rbp - adcq 16(%rsp), %r10 - movq $-19, %rax - adcq 24(%rsp), %rbp - movq $0x7fffffffffffffff, %rdx movq %rbp, %r11 + movq $0x7fffffffffffffff, %rdx sarq $63, %rbp # Mask the modulus andq %rbp, %rax @@ -2116,22 +2069,47 @@ L_curve25519_x64_bits: sbbq %rbp, %r9 sbbq %rbp, %r10 sbbq %rdx, %r11 + # Sub + subq 32(%rsp), %r12 + movq $0x00, %rbp + sbbq 40(%rsp), %r13 + movq $-19, %rax + sbbq 48(%rsp), %r14 + movq $0x7fffffffffffffff, %rdx + sbbq 56(%rsp), %r15 + sbbq $0x00, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Add modulus (if underflow) + addq %rax, %r12 + adcq %rbp, %r13 + adcq %rbp, %r14 + adcq %rdx, %r15 movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) + movq %r12, 96(%rsp) + movq %r13, 104(%rsp) + movq %r14, 112(%rsp) + movq %r15, 120(%rsp) # Add - movq 64(%rsp), %rcx - movq 72(%rsp), %r9 - addq 32(%rsp), %rcx - movq 80(%rsp), %r10 - adcq 40(%rsp), %r9 - movq 88(%rsp), %rbp - adcq 48(%rsp), %r10 + movq (%rdi), %rcx + movq 8(%rdi), %r9 + movq 16(%rdi), %r10 + movq 24(%rdi), %rbp + movq %rcx, %r12 + addq (%rsp), %rcx + movq %r9, %r13 + adcq 
8(%rsp), %r9 + movq %r10, %r14 + adcq 16(%rsp), %r10 + movq %rbp, %r15 + adcq 24(%rsp), %rbp movq $-19, %rax - adcq 56(%rsp), %rbp - movq $0x7fffffffffffffff, %rdx movq %rbp, %r11 + movq $0x7fffffffffffffff, %rdx sarq $63, %rbp # Mask the modulus andq %rbp, %rax @@ -2141,10 +2119,31 @@ L_curve25519_x64_bits: sbbq %rbp, %r9 sbbq %rbp, %r10 sbbq %rdx, %r11 + # Sub + subq (%rsp), %r12 + movq $0x00, %rbp + sbbq 8(%rsp), %r13 + movq $-19, %rax + sbbq 16(%rsp), %r14 + movq $0x7fffffffffffffff, %rdx + sbbq 24(%rsp), %r15 + sbbq $0x00, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Add modulus (if underflow) + addq %rax, %r12 + adcq %rbp, %r13 + adcq %rbp, %r14 + adcq %rdx, %r15 movq %rcx, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) + movq %r12, 128(%rsp) + movq %r13, 136(%rsp) + movq %r14, 144(%rsp) + movq %r15, 152(%rsp) # Multiply # A[0] * B[0] movq (%rdi), %rax @@ -2270,7 +2269,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -2423,7 +2422,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -2549,7 +2548,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -2675,7 +2674,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -2706,15 +2705,19 @@ L_curve25519_x64_bits: # Add movq 32(%rsp), %rcx movq 40(%rsp), %r9 - addq (%rsp), %rcx movq 48(%rsp), %r10 - adcq 8(%rsp), %r9 movq 56(%rsp), %rbp + movq %rcx, %r12 + addq (%rsp), %rcx + movq %r9, %r13 + adcq 8(%rsp), %r9 + movq %r10, 
%r14 adcq 16(%rsp), %r10 - movq $-19, %rax + movq %rbp, %r15 adcq 24(%rsp), %rbp - movq $0x7fffffffffffffff, %rdx + movq $-19, %rax movq %rbp, %r11 + movq $0x7fffffffffffffff, %rdx sarq $63, %rbp # Mask the modulus andq %rbp, %rax @@ -2724,35 +2727,31 @@ L_curve25519_x64_bits: sbbq %rbp, %r9 sbbq %rbp, %r10 sbbq %rdx, %r11 - movq %rcx, 64(%rsp) - movq %r9, 72(%rsp) - movq %r10, 80(%rsp) - movq %r11, 88(%rsp) # Sub - movq 32(%rsp), %rcx - movq 40(%rsp), %r9 - movq 48(%rsp), %r10 - movq 56(%rsp), %r11 - subq (%rsp), %rcx + subq (%rsp), %r12 movq $0x00, %rbp - sbbq 8(%rsp), %r9 + sbbq 8(%rsp), %r13 movq $-19, %rax - sbbq 16(%rsp), %r10 + sbbq 16(%rsp), %r14 movq $0x7fffffffffffffff, %rdx - sbbq 24(%rsp), %r11 + sbbq 24(%rsp), %r15 sbbq $0x00, %rbp # Mask the modulus andq %rbp, %rax andq %rbp, %rdx # Add modulus (if underflow) - addq %rax, %rcx - adcq %rbp, %r9 - adcq %rbp, %r10 - adcq %rdx, %r11 - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) + addq %rax, %r12 + adcq %rbp, %r13 + adcq %rbp, %r14 + adcq %rdx, %r15 + movq %rcx, 64(%rsp) + movq %r9, 72(%rsp) + movq %r10, 80(%rsp) + movq %r11, 88(%rsp) + movq %r12, (%rsp) + movq %r13, 8(%rsp) + movq %r14, 16(%rsp) + movq %r15, 24(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax @@ -2878,7 +2877,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -3029,7 +3028,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -3188,7 +3187,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -3366,7 +3365,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add 
remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -3519,7 +3518,7 @@ L_curve25519_x64_bits: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -3939,7 +3938,7 @@ L_curve25519_x64_inv_8: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -4405,7 +4404,7 @@ _fe_ge_to_p2_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -4561,7 +4560,7 @@ _fe_ge_to_p2_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -4717,7 +4716,7 @@ _fe_ge_to_p2_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -4905,7 +4904,7 @@ _fe_ge_to_p3_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5061,7 +5060,7 @@ _fe_ge_to_p3_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5217,7 +5216,7 @@ _fe_ge_to_p3_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5373,7 +5372,7 @@ _fe_ge_to_p3_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5535,7 +5534,7 @@ _fe_ge_dbl_x64: movq $19, %rax adcq 
%rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5663,7 +5662,7 @@ _fe_ge_dbl_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -5803,7 +5802,7 @@ _fe_ge_dbl_x64: mulq %r15 # Add remaining produce results in addq %rcx, %r8 - addq %r12, %r9 + adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 @@ -5958,7 +5957,7 @@ _fe_ge_dbl_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -6316,7 +6315,7 @@ _fe_ge_madd_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -6472,7 +6471,7 @@ _fe_ge_madd_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -6628,7 +6627,7 @@ _fe_ge_madd_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -7014,7 +7013,7 @@ _fe_ge_msub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -7170,7 +7169,7 @@ _fe_ge_msub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -7326,7 +7325,7 @@ _fe_ge_msub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -7712,7 +7711,7 @@ _fe_ge_add_x64: movq $19, %rax adcq %rdx, %r14 
mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -7868,7 +7867,7 @@ _fe_ge_add_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -8024,7 +8023,7 @@ _fe_ge_add_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -8180,7 +8179,7 @@ _fe_ge_add_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -8566,7 +8565,7 @@ _fe_ge_sub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -8722,7 +8721,7 @@ _fe_ge_sub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -8878,7 +8877,7 @@ _fe_ge_sub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -9034,7 +9033,7 @@ _fe_ge_sub_x64: movq $19, %rax adcq %rdx, %r14 mulq %r15 - # Add remaining produce results in + # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 @@ -10052,68 +10051,22 @@ L_curve25519_avx2_bits: xorq %r11, 48(%rsp) xorq %r12, 56(%rsp) movq %rax, 184(%rsp) - # Sub - movq 64(%rsp), %r9 - movq 72(%rsp), %r10 - movq 80(%rsp), %r11 - movq 88(%rsp), %r12 - subq 32(%rsp), %r9 - movq $0x00, %rax - sbbq 40(%rsp), %r10 - movq $-19, %rcx - sbbq 48(%rsp), %r11 - movq $0x7fffffffffffffff, %rbx - sbbq 56(%rsp), %r12 - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Add modulus (if 
underflow) - addq %rcx, %r9 - adcq %rax, %r10 - adcq %rax, %r11 - adcq %rbx, %r12 - movq %r9, 96(%rsp) - movq %r10, 104(%rsp) - movq %r11, 112(%rsp) - movq %r12, 120(%rsp) - # Sub - movq (%rdi), %r9 - movq 8(%rdi), %r10 - movq 16(%rdi), %r11 - movq 24(%rdi), %r12 - subq (%rsp), %r9 - movq $0x00, %rax - sbbq 8(%rsp), %r10 - movq $-19, %rcx - sbbq 16(%rsp), %r11 - movq $0x7fffffffffffffff, %rbx - sbbq 24(%rsp), %r12 - sbbq $0x00, %rax - # Mask the modulus - andq %rax, %rcx - andq %rax, %rbx - # Add modulus (if underflow) - addq %rcx, %r9 - adcq %rax, %r10 - adcq %rax, %r11 - adcq %rbx, %r12 - movq %r9, 128(%rsp) - movq %r10, 136(%rsp) - movq %r11, 144(%rsp) - movq %r12, 152(%rsp) # Add movq (%rdi), %r9 movq 8(%rdi), %r10 - addq (%rsp), %r9 movq 16(%rdi), %r11 - adcq 8(%rsp), %r10 movq 24(%rdi), %rax + movq %r9, %r13 + addq (%rsp), %r9 + movq %r10, %r14 + adcq 8(%rsp), %r10 + movq %r11, %r15 adcq 16(%rsp), %r11 - movq $-19, %rcx + movq %rax, %rbp adcq 24(%rsp), %rax - movq $0x7fffffffffffffff, %rbx + movq $-19, %rcx movq %rax, %r12 + movq $0x7fffffffffffffff, %rbx sarq $63, %rax # Mask the modulus andq %rax, %rcx @@ -10123,22 +10076,47 @@ L_curve25519_avx2_bits: sbbq %rax, %r10 sbbq %rax, %r11 sbbq %rbx, %r12 + # Sub + subq (%rsp), %r13 + movq $0x00, %rax + sbbq 8(%rsp), %r14 + movq $-19, %rcx + sbbq 16(%rsp), %r15 + movq $0x7fffffffffffffff, %rbx + sbbq 24(%rsp), %rbp + sbbq $0x00, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Add modulus (if underflow) + addq %rcx, %r13 + adcq %rax, %r14 + adcq %rax, %r15 + adcq %rbx, %rbp movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) + movq %r13, 128(%rsp) + movq %r14, 136(%rsp) + movq %r15, 144(%rsp) + movq %rbp, 152(%rsp) # Add movq 64(%rsp), %r9 movq 72(%rsp), %r10 - addq 32(%rsp), %r9 movq 80(%rsp), %r11 - adcq 40(%rsp), %r10 movq 88(%rsp), %rax + movq %r9, %r13 + addq 32(%rsp), %r9 + movq %r10, %r14 + adcq 40(%rsp), %r10 + movq %r11, %r15 adcq 48(%rsp), %r11 - movq $-19, %rcx + 
movq %rax, %rbp adcq 56(%rsp), %rax - movq $0x7fffffffffffffff, %rbx + movq $-19, %rcx movq %rax, %r12 + movq $0x7fffffffffffffff, %rbx sarq $63, %rax # Mask the modulus andq %rax, %rcx @@ -10148,10 +10126,31 @@ L_curve25519_avx2_bits: sbbq %rax, %r10 sbbq %rax, %r11 sbbq %rbx, %r12 + # Sub + subq 32(%rsp), %r13 + movq $0x00, %rax + sbbq 40(%rsp), %r14 + movq $-19, %rcx + sbbq 48(%rsp), %r15 + movq $0x7fffffffffffffff, %rbx + sbbq 56(%rsp), %rbp + sbbq $0x00, %rax + # Mask the modulus + andq %rax, %rcx + andq %rax, %rbx + # Add modulus (if underflow) + addq %rcx, %r13 + adcq %rax, %r14 + adcq %rax, %r15 + adcq %rbx, %rbp movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) + movq %r13, 96(%rsp) + movq %r14, 104(%rsp) + movq %r15, 112(%rsp) + movq %rbp, 120(%rsp) # Multiply # A[0] * B[0] movq (%rdi), %rdx @@ -10607,15 +10606,19 @@ L_curve25519_avx2_bits: # Add movq 32(%rsp), %r9 movq 40(%rsp), %r10 - addq (%rsp), %r9 movq 48(%rsp), %r11 - adcq 8(%rsp), %r10 movq 56(%rsp), %rax + movq %r9, %r13 + addq (%rsp), %r9 + movq %r10, %r14 + adcq 8(%rsp), %r10 + movq %r11, %r15 adcq 16(%rsp), %r11 - movq $-19, %rcx + movq %rax, %rbp adcq 24(%rsp), %rax - movq $0x7fffffffffffffff, %rbx + movq $-19, %rcx movq %rax, %r12 + movq $0x7fffffffffffffff, %rbx sarq $63, %rax # Mask the modulus andq %rax, %rcx @@ -10625,35 +10628,31 @@ L_curve25519_avx2_bits: sbbq %rax, %r10 sbbq %rax, %r11 sbbq %rbx, %r12 - movq %r9, 64(%rsp) - movq %r10, 72(%rsp) - movq %r11, 80(%rsp) - movq %r12, 88(%rsp) # Sub - movq 32(%rsp), %r9 - movq 40(%rsp), %r10 - movq 48(%rsp), %r11 - movq 56(%rsp), %r12 - subq (%rsp), %r9 + subq (%rsp), %r13 movq $0x00, %rax - sbbq 8(%rsp), %r10 + sbbq 8(%rsp), %r14 movq $-19, %rcx - sbbq 16(%rsp), %r11 + sbbq 16(%rsp), %r15 movq $0x7fffffffffffffff, %rbx - sbbq 24(%rsp), %r12 + sbbq 24(%rsp), %rbp sbbq $0x00, %rax # Mask the modulus andq %rax, %rcx andq %rax, %rbx # Add modulus (if underflow) - addq %rcx, %r9 - adcq %rax, %r10 - adcq %rax, %r11 - 
adcq %rbx, %r12 - movq %r9, (%rsp) - movq %r10, 8(%rsp) - movq %r11, 16(%rsp) - movq %r12, 24(%rsp) + addq %rcx, %r13 + adcq %rax, %r14 + adcq %rax, %r15 + adcq %rbx, %rbp + movq %r9, 64(%rsp) + movq %r10, 72(%rsp) + movq %r11, 80(%rsp) + movq %r12, 88(%rsp) + movq %r13, (%rsp) + movq %r14, 8(%rsp) + movq %r15, 16(%rsp) + movq %rbp, 24(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rdx From 87fb9f73e9ee372ac9cc2fd6eea400f53a14346f Mon Sep 17 00:00:00 2001 From: Tesfa Mael Date: Tue, 21 May 2019 14:09:50 -0700 Subject: [PATCH 02/21] Added RISC-V SiFive FE310 support Added freedom-e-sdk based makefile Updated ecc test code to not allocate more memory than requried Run wolfcrypt and benchmark tests on the SiFive HiFive1 targets --- IDE/ECLIPSE/SIFIVE/Makefile | 31 ++ IDE/ECLIPSE/SIFIVE/README.md | 185 ++++++++++ IDE/ECLIPSE/SIFIVE/include.am | 9 + IDE/ECLIPSE/SIFIVE/main.c | 115 ++++++ IDE/ECLIPSE/SIFIVE/user_settings.h | 566 +++++++++++++++++++++++++++++ wolfcrypt/src/random.c | 3 +- wolfcrypt/test/test.c | 18 +- 7 files changed, 921 insertions(+), 6 deletions(-) create mode 100644 IDE/ECLIPSE/SIFIVE/Makefile create mode 100644 IDE/ECLIPSE/SIFIVE/README.md create mode 100644 IDE/ECLIPSE/SIFIVE/include.am create mode 100644 IDE/ECLIPSE/SIFIVE/main.c create mode 100644 IDE/ECLIPSE/SIFIVE/user_settings.h diff --git a/IDE/ECLIPSE/SIFIVE/Makefile b/IDE/ECLIPSE/SIFIVE/Makefile new file mode 100644 index 000000000..9cb031127 --- /dev/null +++ b/IDE/ECLIPSE/SIFIVE/Makefile @@ -0,0 +1,31 @@ +PROGRAM ?= wolfcrypt + +# This line must be added in your freedom-e-sdk/scripts/standalone.mk +# RISCV_CFLAGS += -I$(WOLFSSL_SRC_DIR) -I$(WOLFSSL_SRC_DIR)/IDE/ECLIPSE/SIFIVE -DWOLFSSL_USER_SETTINGS +# WOLFSSL_SRC_DIR variable must be set in the environment when GNU make is started. 
+ +WOLFSSL_CFLAGS += -I$(WOLFSSL_SRC_DIR) \ + -I$(WOLFSSL_SRC_DIR)/IDE/ECLIPSE/SIFIVE \ + -DWOLFSSL_USER_SETTINGS + +SRC_FILES = $(wildcard $(WOLFSSL_SRC_DIR)/src/*.c) +SRC_FILES += $(wildcard $(WOLFSSL_SRC_DIR)/wolfcrypt/src/*.c) +SRC_FILES := $(filter-out %bio.c %misc.c %evp.c, $(SRC_FILES)) + +SRC =$(WOLFSSL_SRC_DIR)/IDE/ECLIPSE/SIFIVE/main.c \ + $(SRC_FILES) \ + $(WOLFSSL_SRC_DIR)/wolfcrypt/test/test.c \ + $(WOLFSSL_SRC_DIR)/wolfcrypt/benchmark/benchmark.c + +OPT_CFLAGS = -specs=nano.specs +#OPT_CFLAGS += -O3 -DTIME -DNOENUM -Wno-implicit -mexplicit-relocs -save-temps +#OPT_CFLAGS += -fno-inline -fno-builtin-printf -fno-common -falign-functions=4 + +override CFLAGS += $(OPT_CFLAGS) $(WOLFSSL_CFLAGS) \ + -Xlinker --defsym=__stack_size=0x1000 + +$(PROGRAM): $(SRC) + $(CC) $(CFLAGS) $(SRC) $(LDFLAGS) $(LDLIBS) -o $@ + +clean: + rm -f $(PROGRAM) $(PROGRAM).hex diff --git a/IDE/ECLIPSE/SIFIVE/README.md b/IDE/ECLIPSE/SIFIVE/README.md new file mode 100644 index 000000000..a6f0d6cbb --- /dev/null +++ b/IDE/ECLIPSE/SIFIVE/README.md @@ -0,0 +1,185 @@ +# SiFive RISC-V HiFive Port +## Overview +You can enable the wolfSSL support for RISC-V using the `#define WOLFSSL_SIFIVE_RISC_V`. + +## Prerequisites +1. Follow the instructions on the SiFive GitHub [here](https://github.com/sifive/freedom-e-sdk) and SiFive website [here](https://www.sifive.com/) to download the freedom-e-sdk and software tools. +3. Run a simple hello application on your development board to confirm that your board functions as expected and the communication between your computer and the board works. + +## Usage +You can start with a wolfcrypt example project to integrate the wolfSSL source code. +wolfSSL supports a compile-time user configurable options in the `IDE/ECLIPSE/SIFIVE/user_settings.h` file. + +The `IDE/ECLIPSE/SIFIVE/main.c` example application provides a function to run the selected examples at compile time through the following two #defines in user_settings.h. 
You can define these macro options to disable the test run. +``` +- #undef NO_CRYPT_TEST +- #undef NO_CRYPT_BENCHMARK +``` + +## Setup +### Setting up the SDK with wolfSSL +1. Download the wolfSSL source code or a zip file from GitHub and place it under your SDK `$HOME` directory. You can also copy or simlink to the source. +``` + For example, + $ cd $HOME + $ git clone --depth=1 https://github.com/wolfSSL/wolfssl.git + +``` +2. Copy the wolfcrypt example project into your `freedom-e-sdk/software` directory. + +``` + $ cp -rf ~/wolfssl/IDE/ECLIPSE/SIFIVE ~/freedom-e-sdk/software/wolfcrypt +``` + +3. Edit your `~/freedom-e-sdk/scripts/standalone.mk` and add the following line after the last RISCV_CFLAGS entry: + +``` + RISCV_CFLAGS += -I$(WOLFSSL_SRC_DIR) -I$(WOLFSSL_SRC_DIR)/IDE/ECLIPSE/SIFIVE -DWOLFSSL_USER_SETTINGS +``` + +4. WOLFSSL_SRC_DIR variable must be set in the environment when GNU make is started. + +``` + $ export WOLFSSL_SRC_DIR=~/wolfssl +``` + +5. Setup your riscv64 compiler + +``` + $ export RISCV_OPENOCD_PATH=/opt/riscv-openocd +``` +6. (Optional) Setup OpenOCD if your target supports it: + +``` + $ export RISCV_OPENOCD_PATH=/opt/riscv-openocd +``` +## Building and Running + +You can build from source or create a static library. + +1. Using command-line: + +``` + $ cd freedom-e-sdk + $ make PROGRAM=wolfcrypt TARGET=sifive-hifive1-revb CONFIGURATION=debug clean software upload +``` +This example cleans, builds and uploads the software on the sifive-hifive1-revb target but you can also combine and build for any of the supported targets. + +Review the test results on the target console. + +2. 
Building a static library for RISC-V using a cross-compiler: + +``` +$ cd $WOLFSSL_SRC_DIR + +$./configure --host=riscv64-unknown-elf \ +CC=riscv64-unknown-elf-gcc \ +AR=riscv64-unknown-elf-ar \ +AS=riscv64-unknown-elf-as \ +RANLIB=$RISCV_PATH/bin/riscv64-unknown-elf-gcc-ranlib \ +LD=riscv64-unknown-elf-ld \ +CXX=riscv64-unknown-elf-g++ \ +--disable-examples --enable-static --disable-shared \ +CFLAGS="-march=rv32imac -mabi=ilp32 -mcmodel=medlow -ffunction-sections -fdata-sections -I~/freedom-e-sdk/bsp/sifive-hifive1/install/include -O0 -g -DNO_FILESYSTEM -DWOLFSSL_NO_SOCK -DNO_WRITEV -DWOLFCRYPT_ONLY -DWOLFSSL_GENSEED_FORTEST -DWOLFSSL_SIFIVE_RISC_V" + +$make +$sudo make install +``` +You can now build and link your software to the wolfSSL libwolfssl.a static library. + +### `wolfcrypt_test()` +wolfcrypt_test() prints a message on the target console similar to the following output: +``` +wolfCrypt Test Started +error test passed! +base64 test passed! +asn test passed! +SHA test passed! +SHA-256 test passed! +Hash test passed! +HMAC-SHA test passed! +HMAC-SHA256 test passed! +GMAC test passed! +AES test passed! +AES192 test passed! +AES256 test passed! +AES-GCM test passed! +RANDOM test passed! +ECC test passed! +ECC buffer test passed! +logging test passed! +mutex test passed! +Test complete +... +wolfCrypt Test Completed +``` +### `benchmark_test()` +benchmark_test() prints a message on the target console similar to the following output. 
+TARGET=sifive-hifive1-revb: +``` +------------------------------------------------------------------------------ + wolfSSL version 4.0.0 +------------------------------------------------------------------------------ +wolfCrypt Benchmark (block bytes 1024, min 1.0 sec each) +RNG 25 KB took 3.000 seconds, 8.333 KB/s +AES-128-CBC-enc 25 KB took 16.000 seconds, 1.562 KB/s +AES-128-CBC-dec 25 KB took 17.000 seconds, 1.471 KB/s +AES-192-CBC-enc 25 KB took 19.000 seconds, 1.316 KB/s +AES-192-CBC-dec 25 KB took 18.000 seconds, 1.389 KB/s +AES-256-CBC-enc 25 KB took 20.000 seconds, 1.250 KB/s +AES-256-CBC-dec 25 KB took 21.000 seconds, 1.190 KB/s +AES-128-GCM-enc 25 KB took 30.000 seconds, 0.833 KB/s +AES-128-GCM-dec 25 KB took 30.000 seconds, 0.833 KB/s +AES-192-GCM-enc 25 KB took 32.000 seconds, 0.781 KB/s +AES-192-GCM-dec 25 KB took 32.000 seconds, 0.781 KB/s +AES-256-GCM-enc 25 KB took 34.000 seconds, 0.735 KB/s +AES-256-GCM-dec 25 KB took 34.000 seconds, 0.735 KB/s +SHA 50 KB took 1.000 seconds, 50.000 KB/s +SHA-256 25 KB took 1.000 seconds, 25.000 KB/s +HMAC-SHA 50 KB took 1.000 seconds, 50.000 KB/s +HMAC-SHA256 25 KB took 1.000 seconds, 25.000 KB/s +ECC 256 key gen 1 ops took 11.000 sec, avg 11000.000 ms, 0.091 ops/sec +ECDHE 256 agree 2 ops took 22.000 sec, avg 11000.000 ms, 0.091 ops/sec +ECDSA 256 sign 2 ops took 23.000 sec, avg 11500.000 ms, 0.087 ops/sec +ECDSA 256 verify 2 ops took 45.000 sec, avg 22500.000 ms, 0.044 ops/sec +Benchmark complete + + +``` +TARGET=sifive-hifive1 +``` +------------------------------------------------------------------------------ + wolfSSL version 4.0.0 +------------------------------------------------------------------------------ +wolfCrypt Benchmark (block bytes 1024, min 1.0 sec each) +RNG 25 KB took 2.000 seconds, 12.500 KB/s +AES-128-CBC-enc 25 KB took 17.000 seconds, 1.471 KB/s +AES-128-CBC-dec 25 KB took 17.000 seconds, 1.471 KB/s +AES-192-CBC-enc 25 KB took 18.000 seconds, 1.389 KB/s +AES-192-CBC-dec 25 KB took 18.000 
seconds, 1.389 KB/s +AES-256-CBC-enc 25 KB took 20.000 seconds, 1.250 KB/s +AES-256-CBC-dec 25 KB took 20.000 seconds, 1.250 KB/s +AES-128-GCM-enc 25 KB took 31.000 seconds, 0.806 KB/s +AES-128-GCM-dec 25 KB took 30.000 seconds, 0.833 KB/s +AES-192-GCM-enc 25 KB took 33.000 seconds, 0.758 KB/s +AES-192-GCM-dec 25 KB took 33.000 seconds, 0.758 KB/s +AES-256-GCM-enc 25 KB took 34.000 seconds, 0.735 KB/s +AES-256-GCM-dec 25 KB took 35.000 seconds, 0.714 KB/s +SHA 50 KB took 1.000 seconds, 50.000 KB/s +SHA-256 25 KB took 1.000 seconds, 25.000 KB/s +HMAC-SHA 25 KB took 1.000 seconds, 25.000 KB/s +HMAC-SHA256 25 KB took 1.000 seconds, 25.000 KB/s +ECC 256 key gen 1 ops took 12.000 sec, avg 12000.000 ms, 0.083 ops/sec +ECDHE 256 agree 2 ops took 24.000 sec, avg 12000.000 ms, 0.083 ops/sec +ECDSA 256 sign 2 ops took 25.000 sec, avg 12500.000 ms, 0.080 ops/sec +ECDSA 256 verify 2 ops took 48.000 sec, avg 24000.000 ms, 0.042 ops/sec +Benchmark complete +``` + +## References +The test results were collected from a SiFive reference platform target with the following hardware, software and tool chains: +- HiFive1 Rev A/Rev B: HiFive1 Development Board with the Freedom Everywhere SoC, E300 +- freedom-e-sdk +- wolfssl [latest version](https://github.com/wolfSSL/wolfssl) + +For more information or questions, please email [support@wolfssl.com](mailto:support@wolfssl.com) diff --git a/IDE/ECLIPSE/SIFIVE/include.am b/IDE/ECLIPSE/SIFIVE/include.am new file mode 100644 index 000000000..5f9550dc2 --- /dev/null +++ b/IDE/ECLIPSE/SIFIVE/include.am @@ -0,0 +1,9 @@ +# vim:ft=automake +# included from Top Level Makefile.am +# All paths should be given relative to the root + +EXTRA_DIST += \ + IDE/ECLIPSE/SIFIVE/README.md \ + IDE/ECLIPSE/SIFIVE/main.c \ + IDE/ECLIPSE/SIFIVE/Makefile\ + IDE/ECLIPSE/SIFIVE/user_settings.h diff --git a/IDE/ECLIPSE/SIFIVE/main.c b/IDE/ECLIPSE/SIFIVE/main.c new file mode 100644 index 000000000..d304d77a6 --- /dev/null +++ b/IDE/ECLIPSE/SIFIVE/main.c @@ -0,0 +1,115 
@@ +/* main.c + * + * Copyright (C) 2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ +#include +#include +#include + +/* wolfCrypt_Init/wolfCrypt_Cleanup */ +#include +#include +#include +#include + +#ifndef NO_CRYPT_BENCHMARK + +/*-specs=nano.specs doesn’t include support for floating point in printf()*/ +asm (".global _printf_float"); + +#define RTC_FREQ 32768 +#define CLINT_MTIME_ADDR 0x200bff8 +#define WOLFSSL_SIFIVE_RISC_V_DEBUG 0 + +double current_time(int reset) +{ + volatile uint64_t * mtime = (uint64_t*) (CLINT_MTIME_ADDR); + uint64_t now = *mtime; + (void)reset; + return now/RTC_FREQ; +} +#endif + +void check(int depth) { + char ch; + char *ptr = malloc(1); + + printf("stack at %p, heap at %p\n", &ch, ptr); + if (depth <= 0) + return; + + check(depth-1); +} + +void mtime_sleep( uint64_t ticks) { + volatile uint64_t * mtime = (uint64_t*) (CLINT_MTIME_ADDR); + uint64_t now = *mtime; + uint64_t then = now + ticks; + + while((*mtime - now) < ticks) { + + } +} + +void delay(int sec) { + uint64_t ticks = sec * RTC_FREQ; + mtime_sleep(ticks); +} + +int main(void) +{ + int ret; + +#if WOLFSSL_SIFIVE_RISC_V_DEBUG + printf("check stack and heap addresses\n"); + check(10); + printf("sleep for 10 seconds to verify timer\n"); + delay(10); + 
printf("awake after sleeping for 10 seconds\n"); +#endif + + #ifdef DEBUG_WOLFSSL + wolfSSL_Debugging_ON(); + #endif + #ifdef HAVE_STACK_SIZE + StackSizeCheck(&args, server_test); + #endif + if ((ret = wolfCrypt_Init()) != 0) { + printf("wolfCrypt_Init failed %d\n", ret); + return -1; + } + +#ifndef NO_CRYPT_TEST + printf("\nwolfCrypt Test Started\n"); + wolfcrypt_test(NULL); + printf("\nwolfCrypt Test Completed\n"); +#endif + +#ifndef NO_CRYPT_BENCHMARK + printf("\nBenchmark Test Started\n"); + benchmark_test(NULL); + printf("\nBenchmark Test Completed\n"); +#endif + if ((ret = wolfCrypt_Cleanup()) != 0) { + printf("wolfCrypt_Cleanup failed %d\n", ret); + return -1; + } + return 0; +} + diff --git a/IDE/ECLIPSE/SIFIVE/user_settings.h b/IDE/ECLIPSE/SIFIVE/user_settings.h new file mode 100644 index 000000000..c2f66b4d7 --- /dev/null +++ b/IDE/ECLIPSE/SIFIVE/user_settings.h @@ -0,0 +1,566 @@ +/* user_settings.h + * + * Copyright (C) 2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Example Settings for SiFive HiFive1 */ + +#ifndef WOLFSSL_USER_SETTINGS_H +#define WOLFSSL_USER_SETTINGS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* ------------------------------------------------------------------------- */ +/* SiFive HiFive */ +/* ------------------------------------------------------------------------- */ +#undef WOLFSSL_SIFIVE_RISC_V +#define WOLFSSL_SIFIVE_RISC_V + + +/* ------------------------------------------------------------------------- */ +/* Platform */ +/* ------------------------------------------------------------------------- */ + +#undef WOLFSSL_GENERAL_ALIGNMENT +#define WOLFSSL_GENERAL_ALIGNMENT 4 + +#undef SINGLE_THREADED +#define SINGLE_THREADED + +#undef WOLFSSL_SMALL_STACK +#define WOLFSSL_SMALL_STACK + +#undef WOLFSSL_USER_IO +#define WOLFSSL_USER_IO + + +/* ------------------------------------------------------------------------- */ +/* Math Configuration */ +/* ------------------------------------------------------------------------- */ +#undef SIZEOF_LONG_LONG +#define SIZEOF_LONG_LONG 8 + +#undef USE_FAST_MATH + +#if 1 + #define USE_FAST_MATH + + #undef TFM_TIMING_RESISTANT + #define TFM_TIMING_RESISTANT + + /* Optimizations */ + //#define TFM_ARM +#endif + +/* ------------------------------------------------------------------------- */ +/* Crypto */ +/* ------------------------------------------------------------------------- */ +/* RSA */ +#undef NO_RSA +#if 0 + #ifdef USE_FAST_MATH + /* Maximum math bits (Max RSA key bits * 2) */ + #undef FP_MAX_BITS + #define FP_MAX_BITS 4096 + #endif + + /* half as much memory but twice as slow */ + #undef RSA_LOW_MEM + #define RSA_LOW_MEM + + /* Enables blinding mode, to prevent timing attacks */ + #if 1 + #undef WC_RSA_BLINDING + #define 
WC_RSA_BLINDING + #else + #undef WC_NO_HARDEN + #define WC_NO_HARDEN + #endif + + /* RSA PSS Support */ + #if 0 + #define WC_RSA_PSS + #endif + + #if 0 + #define WC_RSA_NO_PADDING + #endif +#else + #define NO_RSA +#endif + +/* ECC */ +#undef HAVE_ECC +#if 1 + #define HAVE_ECC + + /* Manually define enabled curves */ + #undef ECC_USER_CURVES + #define ECC_USER_CURVES + + #ifdef ECC_USER_CURVES + /* Manual Curve Selection */ + //#define HAVE_ECC192 + //#define HAVE_ECC224 + #undef NO_ECC256 + //#define HAVE_ECC384 + //#define HAVE_ECC521 + #endif + + /* Fixed point cache (speeds repeated operations against same private key) */ + #undef FP_ECC + //#define FP_ECC + #ifdef FP_ECC + /* Bits / Entries */ + #undef FP_ENTRIES + #define FP_ENTRIES 2 + #undef FP_LUT + #define FP_LUT 4 + #endif + + /* Optional ECC calculation method */ + /* Note: doubles heap usage, but slightly faster */ + #undef ECC_SHAMIR + //#define ECC_SHAMIR + + /* Reduces heap usage, but slower */ + #undef ECC_TIMING_RESISTANT + #define ECC_TIMING_RESISTANT + + /* Enable cofactor support */ + #undef HAVE_ECC_CDH + //#define HAVE_ECC_CDH + + /* Validate import */ + #undef WOLFSSL_VALIDATE_ECC_IMPORT + //#define WOLFSSL_VALIDATE_ECC_IMPORT + + /* Compressed Key Support */ + #undef HAVE_COMP_KEY + //#define HAVE_COMP_KEY + + /* Use alternate ECC size for ECC math */ + #ifdef USE_FAST_MATH + #ifdef NO_RSA + /* Custom fastmath size if not using RSA */ + /* MAX = ROUND32(ECC BITS 256) + SIZE_OF_MP_DIGIT(32) */ + #undef FP_MAX_BITS + #define FP_MAX_BITS (256 + 32) + #else + #undef ALT_ECC_SIZE + #define ALT_ECC_SIZE + #endif + + /* Speedups specific to curve */ + #ifndef NO_ECC256 + #undef TFM_ECC256 + //#define TFM_ECC256 + #endif + #ifndef HAVE_ECC384 + #undef TFM_ECC384 + //#define TFM_ECC384 + #endif + #endif +#endif + +/* DH */ +#undef NO_DH +#if 0 + /* Use table for DH instead of -lm (math) lib dependency */ + #if 0 + #define WOLFSSL_DH_CONST + #endif + + #define HAVE_FFDHE_2048 + //#define 
HAVE_FFDHE_4096 + //#define HAVE_FFDHE_6144 + //#define HAVE_FFDHE_8192 +#else + #define NO_DH +#endif + + +/* AES */ +#undef NO_AES +#if 1 + #undef HAVE_AES_CBC + #define HAVE_AES_CBC + + /* If you need other than AES-CBC mode, you must undefine WOLFSSL_CRYPTOCELL_AES */ + #if !defined(WOLFSSL_CRYPTOCELL_AES) + #undef HAVE_AESGCM + #define HAVE_AESGCM + + /* GCM Method: GCM_SMALL, GCM_WORD32 or GCM_TABLE */ + #define GCM_SMALL + + #undef WOLFSSL_AES_DIRECT + //#define WOLFSSL_AES_DIRECT + + #undef HAVE_AES_ECB + //#define HAVE_AES_ECB + + #undef WOLFSSL_AES_COUNTER + //#define WOLFSSL_AES_COUNTER + + #undef HAVE_AESCCM + //#define HAVE_AESCCM + #endif +#else + #define NO_AES +#endif + + +/* DES3 */ +#undef NO_DES3 +#if 0 +#else + #define NO_DES3 +#endif + +/* ChaCha20 / Poly1305 */ +#undef HAVE_CHACHA +#undef HAVE_POLY1305 +#if 0 + #define HAVE_CHACHA + #define HAVE_POLY1305 + + /* Needed for Poly1305 */ + #undef HAVE_ONE_TIME_AUTH + #define HAVE_ONE_TIME_AUTH +#endif + +/* Ed25519 / Curve25519 */ +#undef HAVE_CURVE25519 +#undef HAVE_ED25519 +#if 0 + #define HAVE_CURVE25519 + #define HAVE_ED25519 /* ED25519 Requires SHA512 */ + + /* Optionally use small math (less flash usage, but much slower) */ + #if 1 + #define CURVED25519_SMALL + #endif +#endif + + +/* ------------------------------------------------------------------------- */ +/* Hashing */ +/* ------------------------------------------------------------------------- */ +/* Sha */ +#undef NO_SHA +#if 1 + /* 1k smaller, but 25% slower */ + //#define USE_SLOW_SHA +#else + #define NO_SHA +#endif + +/* Sha256 */ +#undef NO_SHA256 +#if 1 + /* not unrolled - ~2k smaller and ~25% slower */ + //#define USE_SLOW_SHA256 + + /* Sha224 */ + #if 0 + #define WOLFSSL_SHA224 + #endif +#else + #define NO_SHA256 +#endif + +/* Sha512 */ +#undef WOLFSSL_SHA512 +#if 0 + #define WOLFSSL_SHA512 + + /* Sha384 */ + #undef WOLFSSL_SHA384 + #if 0 + #define WOLFSSL_SHA384 + #endif + + /* over twice as small, but 50% slower */ + 
//#define USE_SLOW_SHA512 +#endif + +/* Sha3 */ +#undef WOLFSSL_SHA3 +#if 0 + #define WOLFSSL_SHA3 +#endif + +/* MD5 */ +#undef NO_MD5 +#if 0 + +#else + #define NO_MD5 +#endif + +/* HKDF */ +#undef HAVE_HKDF +#if 0 + #define HAVE_HKDF +#endif + +/* CMAC */ +#undef WOLFSSL_CMAC +#if 0 + #define WOLFSSL_CMAC +#endif + + +/* ------------------------------------------------------------------------- */ +/* Benchmark / Test */ +/* ------------------------------------------------------------------------- */ +/* Use reduced benchmark / test sizes */ +#undef BENCH_EMBEDDED +#define BENCH_EMBEDDED + +#undef USE_CERT_BUFFERS_2048 +//#define USE_CERT_BUFFERS_2048 + +#undef USE_CERT_BUFFERS_1024 +//#define USE_CERT_BUFFERS_1024 + +#undef USE_CERT_BUFFERS_256 +#define USE_CERT_BUFFERS_256 + + +/* ------------------------------------------------------------------------- */ +/* Debugging */ +/* ------------------------------------------------------------------------- */ + +#undef DEBUG_WOLFSSL +#undef NO_ERROR_STRINGS +#if 0 + #define DEBUG_WOLFSSL +#else + #if 0 + #define NO_ERROR_STRINGS + #endif +#endif + + +/* ------------------------------------------------------------------------- */ +/* Memory */ +/* ------------------------------------------------------------------------- */ + +/* Override Memory API's */ +#if 0 + #undef XMALLOC_OVERRIDE + #define XMALLOC_OVERRIDE + + /* prototypes for user heap override functions */ + /* Note: Realloc only required for normal math */ + #include /* for size_t */ + extern void *myMalloc(size_t n, void* heap, int type); + extern void myFree(void *p, void* heap, int type); + extern void *myRealloc(void *p, size_t n, void* heap, int type); + + #define XMALLOC(n, h, t) myMalloc(n, h, t) + #define XFREE(p, h, t) myFree(p, h, t) + #define XREALLOC(p, n, h, t) myRealloc(p, n, h, t) +#endif + +#if 0 + /* Static memory requires fast math */ + #define WOLFSSL_STATIC_MEMORY + + /* Disable fallback malloc/free */ + #define WOLFSSL_NO_MALLOC + #if 1 + 
#define WOLFSSL_MALLOC_CHECK /* trap malloc failure */ + #endif +#endif + +/* Memory callbacks */ +#if 0 + #undef USE_WOLFSSL_MEMORY + #define USE_WOLFSSL_MEMORY + + /* Use this to measure / print heap usage */ + #if 1 + #undef WOLFSSL_TRACK_MEMORY + #define WOLFSSL_TRACK_MEMORY + + #undef WOLFSSL_DEBUG_MEMORY + #define WOLFSSL_DEBUG_MEMORY + #endif +#else + #ifndef WOLFSSL_STATIC_MEMORY + #define NO_WOLFSSL_MEMORY + /* Otherwise we will use stdlib malloc, free and realloc */ + #endif +#endif + + +/* ------------------------------------------------------------------------- */ +/* Port */ +/* ------------------------------------------------------------------------- */ + +/* Override Current Time */ +/* Allows custom "custom_time()" function to be used for benchmark */ +#if defined(WOLFSSL_SIFIVE_RISC_V) + #define WOLFSSL_USER_CURRTIME + #define WOLFSSL_GMTIME + #define USER_TICKS +#endif + +#if !defined(WOLFSSL_SIFIVE_RISC_V) +// extern unsigned long my_time(unsigned long* timer); +// #define XTIME my_time +#endif + +/* ------------------------------------------------------------------------- */ +/* RNG */ +/* ------------------------------------------------------------------------- */ + +#if defined(WOLFSSL_SIFIVE_RISC_V) + /* Override P-RNG with HW RNG */ + //extern int my_random_generate(byte* output, word32 sz); + //#undef CUSTOM_RAND_GENERATE_BLOCK + //#define CUSTOM_RAND_GENERATE_BLOCK my_random_generate + #define WOLFSSL_GENSEED_FORTEST /* for software RNG*/ +#else + #define WOLFSSL_GENSEED_FORTEST +#endif + + +/* ------------------------------------------------------------------------- */ +/* Enable Features */ +/* ------------------------------------------------------------------------- */ +#undef WOLFSSL_TLS13 +#if 0 + #define WOLFSSL_TLS13 +#endif + +#undef WOLFSSL_KEY_GEN +#if 0 + #define WOLFSSL_KEY_GEN +#endif + +/* reduce DH test time */ +#define WOLFSSL_OLD_PRIME_CHECK + +#undef KEEP_PEER_CERT +//#define KEEP_PEER_CERT + +#undef HAVE_COMP_KEY 
+//#define HAVE_COMP_KEY + +#undef HAVE_TLS_EXTENSIONS +#define HAVE_TLS_EXTENSIONS + +#undef HAVE_SUPPORTED_CURVES +#define HAVE_SUPPORTED_CURVES + +#undef WOLFSSL_BASE64_ENCODE +#define WOLFSSL_BASE64_ENCODE + +/* TLS Session Cache */ +#if 0 + #define SMALL_SESSION_CACHE +#else + #define NO_SESSION_CACHE +#endif + + +/* ------------------------------------------------------------------------- */ +/* Disable Features */ +/* ------------------------------------------------------------------------- */ +#undef NO_WOLFSSL_SERVER +//#define NO_WOLFSSL_SERVER + +#undef NO_WOLFSSL_CLIENT +//#define NO_WOLFSSL_CLIENT + +#undef NO_CRYPT_TEST +//#define NO_CRYPT_TEST + +#undef NO_CRYPT_BENCHMARK +//#define NO_CRYPT_BENCHMARK + +#undef WOLFCRYPT_ONLY +//#define WOLFCRYPT_ONLY + +/* In-lining of misc.c functions */ +/* If defined, must include wolfcrypt/src/misc.c in build */ +/* Slower, but about 1k smaller */ +#undef NO_INLINE +//#define NO_INLINE + +#undef NO_FILESYSTEM +#define NO_FILESYSTEM + +#undef NO_WRITEV +#define NO_WRITEV + +#undef NO_MAIN_DRIVER +#define NO_MAIN_DRIVER + +#undef NO_DEV_RANDOM +#define NO_DEV_RANDOM + +#undef NO_DSA +#define NO_DSA + +#undef NO_RC4 +#define NO_RC4 + +#undef NO_OLD_TLS +#define NO_OLD_TLS + +#undef NO_HC128 +#define NO_HC128 + +#undef NO_RABBIT +#define NO_RABBIT + +#undef NO_PSK +#define NO_PSK + +#undef NO_MD4 +#define NO_MD4 + +#undef NO_PWDBASED +#define NO_PWDBASED + +#undef NO_CODING +//#define NO_CODING + +#undef NO_ASN_TIME +//#define NO_ASN_TIME + +#undef NO_CERTS +//#define NO_CERTS + +#undef NO_SIG_WRAPPER +//#define NO_SIG_WRAPPER + +#ifdef __cplusplus +} +#endif + +#endif /* WOLFSSL_USER_SETTINGS_H */ + diff --git a/wolfcrypt/src/random.c b/wolfcrypt/src/random.c index f6751330e..72c74e7f7 100644 --- a/wolfcrypt/src/random.c +++ b/wolfcrypt/src/random.c @@ -2327,12 +2327,13 @@ int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz) #endif #ifdef USE_TEST_GENSEED +#ifndef WOLFSSL_SIFIVE_RISC_V #ifndef _MSC_VER 
#warning "write a real random seed!!!!, just for testing now" #else #pragma message("Warning: write a real random seed!!!!, just for testing now") #endif - +#endif /* !WOLFSSL_SIFIVE_RISC_V*/ int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz) { word32 i; diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index d245277f3..b95a105d2 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -15904,6 +15904,19 @@ static int ecc_test_make_pub(WC_RNG* rng) wc_ecc_init_ex(&key, HEAP_HINT, devId); +#ifdef USE_CERT_BUFFERS_256 + tmp = (byte*)XMALLOC((size_t)sizeof_ecc_key_der_256, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); + if (tmp == NULL) { + return -8311; + } + exportBuf = (byte*)XMALLOC((size_t)sizeof_ecc_key_der_256, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); + if (exportBuf == NULL) { + XFREE(tmp, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); + return -8312; + } + XMEMCPY(tmp, ecc_key_der_256, (size_t)sizeof_ecc_key_der_256); + tmpSz = (size_t)sizeof_ecc_key_der_256; +#else tmp = (byte*)XMALLOC(FOURK_BUF, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); if (tmp == NULL) { return -8311; @@ -15913,11 +15926,6 @@ static int ecc_test_make_pub(WC_RNG* rng) XFREE(tmp, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); return -8312; } - -#ifdef USE_CERT_BUFFERS_256 - XMEMCPY(tmp, ecc_key_der_256, (size_t)sizeof_ecc_key_der_256); - tmpSz = (size_t)sizeof_ecc_key_der_256; -#else file = XFOPEN(eccKeyDerFile, "rb"); if (!file) { ERROR_OUT(-8313, done); From 765b075e5023e64d9c6368192e13c4b8d57ad6e0 Mon Sep 17 00:00:00 2001 From: Tesfa Mael Date: Tue, 28 May 2019 17:37:16 -0700 Subject: [PATCH 03/21] Updated with review comments --- IDE/ECLIPSE/SIFIVE/Makefile | 3 ++ IDE/ECLIPSE/SIFIVE/README.md | 14 +++++-- IDE/ECLIPSE/SIFIVE/main.c | 47 +++++++++++++++++++++-- IDE/ECLIPSE/SIFIVE/user_settings.h | 61 +++++++++++++++--------------- wolfcrypt/src/random.c | 2 - 5 files changed, 88 insertions(+), 39 deletions(-) diff --git a/IDE/ECLIPSE/SIFIVE/Makefile b/IDE/ECLIPSE/SIFIVE/Makefile index 
9cb031127..81f2620b6 100644 --- a/IDE/ECLIPSE/SIFIVE/Makefile +++ b/IDE/ECLIPSE/SIFIVE/Makefile @@ -21,6 +21,9 @@ OPT_CFLAGS = -specs=nano.specs #OPT_CFLAGS += -O3 -DTIME -DNOENUM -Wno-implicit -mexplicit-relocs -save-temps #OPT_CFLAGS += -fno-inline -fno-builtin-printf -fno-common -falign-functions=4 +# overwrite the __stack_size default value of 0x400 with 0x1000(4 Kbytes). +# The __stack_size and __heap_size symbols are defined in the linker metal.default.ld +# script in the freedom-e-sdk. override CFLAGS += $(OPT_CFLAGS) $(WOLFSSL_CFLAGS) \ -Xlinker --defsym=__stack_size=0x1000 diff --git a/IDE/ECLIPSE/SIFIVE/README.md b/IDE/ECLIPSE/SIFIVE/README.md index a6f0d6cbb..bd13c7667 100644 --- a/IDE/ECLIPSE/SIFIVE/README.md +++ b/IDE/ECLIPSE/SIFIVE/README.md @@ -15,6 +15,11 @@ The `IDE/ECLIPSE/SIFIVE/main.c` example application provides a function to run t - #undef NO_CRYPT_TEST - #undef NO_CRYPT_BENCHMARK ``` +## Tested Configurations +- SHA-1 +- SHA-256 +- AES CBC +- ECC sign/verify/shared secret with fast math library ## Setup ### Setting up the SDK with wolfSSL 1. Download the wolfSSL source code or a zip file from GitHub and place it under your SDK `$HOME` directory. You can also copy or simlink to the source. @@ -80,7 +85,7 @@ RANLIB=$RISCV_PATH/bin/riscv64-unknown-elf-gcc-ranlib \ LD=riscv64-unknown-elf-ld \ CXX=riscv64-unknown-elf-g++ \ --disable-examples --enable-static --disable-shared \ -CFLAGS="-march=rv32imac -mabi=ilp32 -mcmodel=medlow -ffunction-sections -fdata-sections -I~/freedom-e-sdk/bsp/sifive-hifive1/install/include -O0 -g -DNO_FILESYSTEM -DWOLFSSL_NO_SOCK -DNO_WRITEV -DWOLFCRYPT_ONLY -DWOLFSSL_GENSEED_FORTEST -DWOLFSSL_SIFIVE_RISC_V" +CFLAGS="-march=rv32imac -mabi=ilp32 -mcmodel=medlow -ffunction-sections -fdata-sections -I~/freedom-e-sdk/bsp/sifive-hifive1/install/include -O0 -g -DNO_FILESYSTEM -DWOLFSSL_NO_SOCK -DNO_WRITEV -DWOLFCRYPT_ONLY -DWOLFSSL_SIFIVE_RISC_V" $make $sudo make install @@ -143,8 +148,6 @@ ECDHE 256 agree 2 ops took 22.000 sec, avg 11000.000 ms, 0.091 ops/ ECDSA 256 sign 2 ops took 23.000 sec, avg 11500.000 ms, 0.087 ops/sec ECDSA 256 verify 2 ops took 45.000 sec, avg 
22500.000 ms, 0.044 ops/sec Benchmark complete - - ``` TARGET=sifive-hifive1 ``` @@ -175,6 +178,11 @@ ECDSA 256 sign 2 ops took 25.000 sec, avg 12500.000 ms, 0.080 ops/ ECDSA 256 verify 2 ops took 48.000 sec, avg 24000.000 ms, 0.042 ops/sec Benchmark complete ``` +## Known Caveats +- If you find the wolfcrypt test stuck on early_trap_vector error, it is likely related to memory issues +- Using the `__stack_size` default value of 0x400 will not be enough for the ECC test to pass. +The `IDE/ECLIPSE/SIFIVE/Makefile` overwrites the value with 0x1000 (4 KBytes) +- Enabling RSA will cause the ECC test to fail due to memory shortage ## References The test results were collected from a SiFive reference platform target with the following hardware, software and tool chains: diff --git a/IDE/ECLIPSE/SIFIVE/main.c b/IDE/ECLIPSE/SIFIVE/main.c index d304d77a6..a1528a32c 100644 --- a/IDE/ECLIPSE/SIFIVE/main.c +++ b/IDE/ECLIPSE/SIFIVE/main.c @@ -46,6 +46,7 @@ double current_time(int reset) } #endif +#if WOLFSSL_SIFIVE_RISC_V_DEBUG void check(int depth) { char ch; char *ptr = malloc(1); @@ -55,6 +56,7 @@ void check(int depth) { return; check(depth-1); + free(ptr); } void mtime_sleep( uint64_t ticks) { @@ -71,6 +73,45 @@ void delay(int sec) { uint64_t ticks = sec * RTC_FREQ; mtime_sleep(ticks); } +#endif + +/* RNG CODE */ +/* TODO: Implement real RNG */ +static unsigned int gCounter; +unsigned int hw_rand(void) +{ + /* #warning Must implement your own random source */ + + return ++gCounter; +} + +unsigned int my_rng_seed_gen(void) +{ + return hw_rand(); +} + +int my_rng_gen_block(unsigned char* output, unsigned int sz) +{ + uint32_t i = 0; + uint32_t randReturnSize = sizeof(CUSTOM_RAND_TYPE); + + while (i < sz) + { + /* If not aligned or there is odd/remainder */ + if((i + randReturnSize) > sz || + ((uint32_t)&output[i] % randReturnSize) != 0 ) { + /* Single byte at a time */ + output[i++] = (unsigned char)my_rng_seed_gen(); + } + else { + /* Use native 8, 16, 32 or 64 copy 
instruction */ + *((CUSTOM_RAND_TYPE*)&output[i]) = my_rng_seed_gen(); + i += randReturnSize; + } + } + + return 0; +} int main(void) { @@ -78,7 +119,7 @@ int main(void) #if WOLFSSL_SIFIVE_RISC_V_DEBUG printf("check stack and heap addresses\n"); - check(10); + check(8); printf("sleep for 10 seconds to verify timer\n"); delay(10); printf("awake after sleeping for 10 seconds\n"); @@ -87,9 +128,7 @@ int main(void) #ifdef DEBUG_WOLFSSL wolfSSL_Debugging_ON(); #endif - #ifdef HAVE_STACK_SIZE - StackSizeCheck(&args, server_test); - #endif + if ((ret = wolfCrypt_Init()) != 0) { printf("wolfCrypt_Init failed %d\n", ret); return -1; diff --git a/IDE/ECLIPSE/SIFIVE/user_settings.h b/IDE/ECLIPSE/SIFIVE/user_settings.h index c2f66b4d7..0d6c31c82 100644 --- a/IDE/ECLIPSE/SIFIVE/user_settings.h +++ b/IDE/ECLIPSE/SIFIVE/user_settings.h @@ -118,11 +118,11 @@ extern "C" { #ifdef ECC_USER_CURVES /* Manual Curve Selection */ - //#define HAVE_ECC192 - //#define HAVE_ECC224 + #define HAVE_ECC192 + #define HAVE_ECC224 #undef NO_ECC256 - //#define HAVE_ECC384 - //#define HAVE_ECC521 + #define HAVE_ECC384 + #define HAVE_ECC521 #endif /* Fixed point cache (speeds repeated operations against same private key) */ @@ -203,29 +203,24 @@ extern "C" { #if 1 #undef HAVE_AES_CBC #define HAVE_AES_CBC + + #undef HAVE_AESGCM + #define HAVE_AESGCM - /* If you need other than AES-CBC mode, you must undefine WOLFSSL_CRYPTOCELL_AES */ - #if !defined(WOLFSSL_CRYPTOCELL_AES) - #undef HAVE_AESGCM - #define HAVE_AESGCM + /* GCM Method: GCM_SMALL, GCM_WORD32 or GCM_TABLE */ + #define GCM_SMALL - /* GCM Method: GCM_SMALL, GCM_WORD32 or GCM_TABLE */ - #define GCM_SMALL + #undef WOLFSSL_AES_DIRECT + //#define WOLFSSL_AES_DIRECT - #undef WOLFSSL_AES_DIRECT - //#define WOLFSSL_AES_DIRECT + #undef HAVE_AES_ECB + //#define HAVE_AES_ECB - #undef HAVE_AES_ECB - //#define HAVE_AES_ECB + #undef WOLFSSL_AES_COUNTER + //#define WOLFSSL_AES_COUNTER - #undef WOLFSSL_AES_COUNTER - //#define WOLFSSL_AES_COUNTER - - #undef 
HAVE_AESCCM - //#define HAVE_AESCCM - #endif -#else - #define NO_AES + #undef HAVE_AESCCM + //#define HAVE_AESCCM #endif @@ -436,16 +431,22 @@ extern "C" { /* RNG */ /* ------------------------------------------------------------------------- */ -#if defined(WOLFSSL_SIFIVE_RISC_V) - /* Override P-RNG with HW RNG */ - //extern int my_random_generate(byte* output, word32 sz); - //#undef CUSTOM_RAND_GENERATE_BLOCK - //#define CUSTOM_RAND_GENERATE_BLOCK my_random_generate - #define WOLFSSL_GENSEED_FORTEST /* for software RNG*/ +#if 1 +/* Bypass P-RNG and use only HW RNG */ +#define CUSTOM_RAND_TYPE unsigned int +extern int my_rng_gen_block(unsigned char* output, unsigned int sz); +#undef CUSTOM_RAND_GENERATE_BLOCK +#define CUSTOM_RAND_GENERATE_BLOCK my_rng_gen_block #else - #define WOLFSSL_GENSEED_FORTEST -#endif + #define HAVE_HASHDRBG + /* Seed Source */ + /* Size of returned HW RNG value */ + #define CUSTOM_RAND_TYPE unsigned int + extern unsigned int my_rng_seed_gen(void); + #undef CUSTOM_RAND_GENERATE + #define CUSTOM_RAND_GENERATE my_rng_seed_gen +#endif /* ------------------------------------------------------------------------- */ /* Enable Features */ diff --git a/wolfcrypt/src/random.c b/wolfcrypt/src/random.c index 72c74e7f7..7cff6040f 100644 --- a/wolfcrypt/src/random.c +++ b/wolfcrypt/src/random.c @@ -2327,13 +2327,11 @@ int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz) #endif #ifdef USE_TEST_GENSEED -#ifndef WOLFSSL_SIFIVE_RISC_V #ifndef _MSC_VER #warning "write a real random seed!!!!, just for testing now" #else #pragma message("Warning: write a real random seed!!!!, just for testing now") #endif -#endif /* !WOLFSSL_SIFIVE_RISC_V*/ int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz) { word32 i; From fea89c52f49d7a4c92d66d74b4252349bba438a5 Mon Sep 17 00:00:00 2001 From: Tesfa Mael Date: Wed, 29 May 2019 10:51:07 -0700 Subject: [PATCH 04/21] configure with ECC256 only --- IDE/ECLIPSE/SIFIVE/README.md | 12 ++++++------ 
IDE/ECLIPSE/SIFIVE/main.c | 19 +++++++++++++++---- IDE/ECLIPSE/SIFIVE/user_settings.h | 10 +++++----- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/IDE/ECLIPSE/SIFIVE/README.md b/IDE/ECLIPSE/SIFIVE/README.md index bd13c7667..4876a937b 100644 --- a/IDE/ECLIPSE/SIFIVE/README.md +++ b/IDE/ECLIPSE/SIFIVE/README.md @@ -15,12 +15,6 @@ The `IDE/ECLIPSE/SIFIVE/main.c` example application provides a function to run t - #undef NO_CRYPT_TEST - #undef NO_CRYPT_BENCHMARK ``` -## Tested Configurations -- SHA-1 -- SHA-256 -- AES CBC -- ECC sign/verify/shared secret with fast math library - ## Setup ### Setting up the SDK with wolfSSL 1. Download the wolfSSL source code or a zip file from GitHub and place it under your SDK `$HOME` directory. You can also copy or simlink to the source. @@ -178,6 +172,12 @@ ECDSA 256 sign 2 ops took 25.000 sec, avg 12500.000 ms, 0.080 ops/ ECDSA 256 verify 2 ops took 48.000 sec, avg 24000.000 ms, 0.042 ops/sec Benchmark complete ``` +## Tested Configurations +- SHA-1 +- SHA-256 +- AES CBC +- ECC 256 sign/verify/shared secret with fast math library + ## Known Caveats - If you find the wolfcrypt test stuck on early_trap_vector error, it is like related to memory issues - Using the `__stack_size` default value of 0x400 will not be enough for the ECC test to pass. 
diff --git a/IDE/ECLIPSE/SIFIVE/main.c b/IDE/ECLIPSE/SIFIVE/main.c index a1528a32c..ec3a975ee 100644 --- a/IDE/ECLIPSE/SIFIVE/main.c +++ b/IDE/ECLIPSE/SIFIVE/main.c @@ -33,16 +33,27 @@ /*-specs=nano.specs doesn’t include support for floating point in printf()*/ asm (".global _printf_float"); -#define RTC_FREQ 32768 +#ifndef RTC_FREQ + #define RTC_FREQ 32768 +#endif + #define CLINT_MTIME_ADDR 0x200bff8 #define WOLFSSL_SIFIVE_RISC_V_DEBUG 0 +unsigned long get_cpu_freq(void) +{ + /* If clocking up the CPU, you need to add a logic to measure cpu freq */ + + return RTC_FREQ; +} + double current_time(int reset) { volatile uint64_t * mtime = (uint64_t*) (CLINT_MTIME_ADDR); uint64_t now = *mtime; (void)reset; - return now/RTC_FREQ; + /**/ + return now/get_cpu_freq(); } #endif @@ -70,7 +81,7 @@ void mtime_sleep( uint64_t ticks) { } void delay(int sec) { - uint64_t ticks = sec * RTC_FREQ; + uint64_t ticks = sec * get_cpu_freq(); mtime_sleep(ticks); } #endif @@ -120,7 +131,7 @@ int main(void) #if WOLFSSL_SIFIVE_RISC_V_DEBUG printf("check stack and heap addresses\n"); check(8); - printf("sleep for 10 seconds to verify timer\n"); + printf("sleep for 10 seconds to verify timer, measure using a stopwatch\n"); delay(10); printf("awake after sleeping for 10 seconds\n"); #endif diff --git a/IDE/ECLIPSE/SIFIVE/user_settings.h b/IDE/ECLIPSE/SIFIVE/user_settings.h index 0d6c31c82..c48ee633f 100644 --- a/IDE/ECLIPSE/SIFIVE/user_settings.h +++ b/IDE/ECLIPSE/SIFIVE/user_settings.h @@ -117,12 +117,12 @@ extern "C" { #define ECC_USER_CURVES #ifdef ECC_USER_CURVES - /* Manual Curve Selection */ - #define HAVE_ECC192 - #define HAVE_ECC224 + /* Manual Curve Selection, FP_MAX_BITS must be adjusted accordingly */ + // #define HAVE_ECC192 + // #define HAVE_ECC224 #undef NO_ECC256 - #define HAVE_ECC384 - #define HAVE_ECC521 + // #define HAVE_ECC384 + // #define HAVE_ECC521 #endif /* Fixed point cache (speeds repeated operations against same private key) */ From 
af9d82963e02ec31d7eb9b336e5f171bb1209b79 Mon Sep 17 00:00:00 2001 From: David Garske Date: Wed, 29 May 2019 12:11:16 -0700 Subject: [PATCH 05/21] Add support for increasing CPU clock speed. New benchmarks with HiFive1 RevB hardware at 320MHz. --- IDE/ECLIPSE/SIFIVE/Makefile | 1 + IDE/ECLIPSE/SIFIVE/README.md | 92 +++++++++++++++--------------------- IDE/ECLIPSE/SIFIVE/main.c | 73 ++++++++++++++++------------ 3 files changed, 82 insertions(+), 84 deletions(-) diff --git a/IDE/ECLIPSE/SIFIVE/Makefile b/IDE/ECLIPSE/SIFIVE/Makefile index 81f2620b6..76f372743 100644 --- a/IDE/ECLIPSE/SIFIVE/Makefile +++ b/IDE/ECLIPSE/SIFIVE/Makefile @@ -3,6 +3,7 @@ PROGRAM ?= wolfcrypt # This line must be added in your freedom-e-sdk/scripts/standalone.mk # RISCV_CFLAGS += -I$(WOLFSSL_SRC_DIR) -I$(WOLFSSL_SRC_DIR)/IDE/ECLIPSE/SIFIVE -DWOLFSSL_USER_SETTINGS # WOLFSSL_SRC_DIR variable must be set in the environment when GNU make is started. +# export WOLFSSL_SRC_DIR=~/freedom-e-sdk/software/wolfssl WOLFSSL_CFLAGS += -I$(WOLFSSL_SRC_DIR) \ -I$(WOLFSSL_SRC_DIR)/IDE/ECLIPSE/SIFIVE \ diff --git a/IDE/ECLIPSE/SIFIVE/README.md b/IDE/ECLIPSE/SIFIVE/README.md index 4876a937b..f590397ca 100644 --- a/IDE/ECLIPSE/SIFIVE/README.md +++ b/IDE/ECLIPSE/SIFIVE/README.md @@ -1,4 +1,5 @@ -# SiFive RISC-V HiFive Port +# SiFive RISC-V HiFive1 Port + ## Overview You can enable the wolfSSL support for RISC-V using the `#define WOLFSSL_SIFIVE_RISC_V`. @@ -15,6 +16,7 @@ The `IDE/ECLIPSE/SIFIVE/main.c` example application provides a function to run t - #undef NO_CRYPT_TEST - #undef NO_CRYPT_BENCHMARK ``` + ## Setup ### Setting up the SDK with wolfSSL 1. Download the wolfSSL source code or a zip file from GitHub and place it under your SDK `$HOME` directory. You can also copy or simlink to the source. @@ -87,10 +89,16 @@ $sudo make install You can now build and link your software to the wolfSSL libwolfssl.a static library. 
### `wolfcrypt_test()` + wolfcrypt_test() prints a message on the target console similar to the following output: + ``` -wolfCrypt Test Started +SiFive HiFive1 Demo +Setting clock to 320MHz +Actual Clock 320MHz + error test passed! +MEMORY test passed! base64 test passed! asn test passed! SHA test passed! @@ -109,73 +117,46 @@ ECC buffer test passed! logging test passed! mutex test passed! Test complete -... -wolfCrypt Test Completed ``` ### `benchmark_test()` + benchmark_test() prints a message on the target console similar to the following output. + TARGET=sifive-hifive1-revb: + ``` ------------------------------------------------------------------------------ wolfSSL version 4.0.0 ------------------------------------------------------------------------------ wolfCrypt Benchmark (block bytes 1024, min 1.0 sec each) -RNG 25 KB took 3.000 seconds, 8.333 KB/s -AES-128-CBC-enc 25 KB took 16.000 seconds, 1.562 KB/s -AES-128-CBC-dec 25 KB took 17.000 seconds, 1.471 KB/s -AES-192-CBC-enc 25 KB took 19.000 seconds, 1.316 KB/s -AES-192-CBC-dec 25 KB took 18.000 seconds, 1.389 KB/s -AES-256-CBC-enc 25 KB took 20.000 seconds, 1.250 KB/s -AES-256-CBC-dec 25 KB took 21.000 seconds, 1.190 KB/s -AES-128-GCM-enc 25 KB took 30.000 seconds, 0.833 KB/s -AES-128-GCM-dec 25 KB took 30.000 seconds, 0.833 KB/s -AES-192-GCM-enc 25 KB took 32.000 seconds, 0.781 KB/s -AES-192-GCM-dec 25 KB took 32.000 seconds, 0.781 KB/s -AES-256-GCM-enc 25 KB took 34.000 seconds, 0.735 KB/s -AES-256-GCM-dec 25 KB took 34.000 seconds, 0.735 KB/s -SHA 50 KB took 1.000 seconds, 50.000 KB/s -SHA-256 25 KB took 1.000 seconds, 25.000 KB/s -HMAC-SHA 50 KB took 1.000 seconds, 50.000 KB/s -HMAC-SHA256 25 KB took 1.000 seconds, 25.000 KB/s -ECC 256 key gen 1 ops took 11.000 sec, avg 11000.000 ms, 0.091 ops/sec -ECDHE 256 agree 2 ops took 22.000 sec, avg 11000.000 ms, 0.091 ops/sec -ECDSA 256 sign 2 ops took 23.000 sec, avg 11500.000 ms, 0.087 ops/sec -ECDSA 256 verify 2 ops took 45.000 sec, avg 22500.000 ms, 0.044 
ops/sec -Benchmark complete -``` -TARGET=sifive-hifive1 -``` ------------------------------------------------------------------------------- - wolfSSL version 4.0.0 ------------------------------------------------------------------------------- -wolfCrypt Benchmark (block bytes 1024, min 1.0 sec each) -RNG 25 KB took 2.000 seconds, 12.500 KB/s -AES-128-CBC-enc 25 KB took 17.000 seconds, 1.471 KB/s -AES-128-CBC-dec 25 KB took 17.000 seconds, 1.471 KB/s -AES-192-CBC-enc 25 KB took 18.000 seconds, 1.389 KB/s -AES-192-CBC-dec 25 KB took 18.000 seconds, 1.389 KB/s -AES-256-CBC-enc 25 KB took 20.000 seconds, 1.250 KB/s -AES-256-CBC-dec 25 KB took 20.000 seconds, 1.250 KB/s -AES-128-GCM-enc 25 KB took 31.000 seconds, 0.806 KB/s -AES-128-GCM-dec 25 KB took 30.000 seconds, 0.833 KB/s -AES-192-GCM-enc 25 KB took 33.000 seconds, 0.758 KB/s -AES-192-GCM-dec 25 KB took 33.000 seconds, 0.758 KB/s -AES-256-GCM-enc 25 KB took 34.000 seconds, 0.735 KB/s -AES-256-GCM-dec 25 KB took 35.000 seconds, 0.714 KB/s -SHA 50 KB took 1.000 seconds, 50.000 KB/s -SHA-256 25 KB took 1.000 seconds, 25.000 KB/s -HMAC-SHA 25 KB took 1.000 seconds, 25.000 KB/s -HMAC-SHA256 25 KB took 1.000 seconds, 25.000 KB/s -ECC 256 key gen 1 ops took 12.000 sec, avg 12000.000 ms, 0.083 ops/sec -ECDHE 256 agree 2 ops took 24.000 sec, avg 12000.000 ms, 0.083 ops/sec -ECDSA 256 sign 2 ops took 25.000 sec, avg 12500.000 ms, 0.080 ops/sec -ECDSA 256 verify 2 ops took 48.000 sec, avg 24000.000 ms, 0.042 ops/sec +RNG 12 MB took 1.000 seconds, 11.666 MB/s +AES-128-CBC-enc 50 KB took 1.659 seconds, 30.131 KB/s +AES-128-CBC-dec 50 KB took 1.657 seconds, 30.183 KB/s +AES-192-CBC-enc 50 KB took 1.839 seconds, 27.189 KB/s +AES-192-CBC-dec 50 KB took 1.836 seconds, 27.230 KB/s +AES-256-CBC-enc 25 KB took 1.010 seconds, 24.759 KB/s +AES-256-CBC-dec 25 KB took 1.008 seconds, 24.791 KB/s +AES-128-GCM-enc 25 KB took 1.508 seconds, 16.576 KB/s +AES-128-GCM-dec 25 KB took 1.510 seconds, 16.559 KB/s +AES-192-GCM-enc 25 KB took 1.605 
seconds, 15.573 KB/s +AES-192-GCM-dec 25 KB took 1.607 seconds, 15.558 KB/s +AES-256-GCM-enc 25 KB took 1.699 seconds, 14.716 KB/s +AES-256-GCM-dec 25 KB took 1.700 seconds, 14.702 KB/s +SHA 2 MB took 1.014 seconds, 1.589 MB/s +SHA-256 425 KB took 1.009 seconds, 421.068 KB/s +HMAC-SHA 1 MB took 1.013 seconds, 1.325 MB/s +HMAC-SHA256 425 KB took 1.018 seconds, 417.420 KB/s +ECC 256 key gen 2 ops took 1.393 sec, avg 696.503 ms, 1.436 ops/sec +ECDHE 256 agree 2 ops took 1.386 sec, avg 692.917 ms, 1.443 ops/sec +ECDSA 256 sign 2 ops took 1.406 sec, avg 703.064 ms, 1.422 ops/sec +ECDSA 256 verify 2 ops took 2.773 sec, avg 1386.597 ms, 0.721 ops/sec Benchmark complete ``` + ## Tested Configurations - SHA-1 - SHA-256 -- AES CBC +- AES CBC/GCM - ECC 256 sign/verify/shared secret with fast math library ## Known Caveats @@ -185,6 +166,7 @@ The `IDE/ECLIPSE/SIFIVE/Makefile` overwrites the value with 0x1000 (4 KBytes) - Enabling RSA will cause the ECC test to fail due to memory shortage ## References + The test results were collected from a SiFive reference platform target with the following hardware, software and tool chains: - HiFive1 Rev A/Rev B: HiFive1 Development Board with the Freedom Everywhere SoC, E300 - freedom-e-sdk diff --git a/IDE/ECLIPSE/SIFIVE/main.c b/IDE/ECLIPSE/SIFIVE/main.c index ec3a975ee..4b0e3e7a2 100644 --- a/IDE/ECLIPSE/SIFIVE/main.c +++ b/IDE/ECLIPSE/SIFIVE/main.c @@ -28,34 +28,33 @@ #include #include +#ifndef __METAL_MACHINE_HEADER +#define __METAL_MACHINE_HEADER "../../../../bsp/sifive-hifive1-revb/metal.h" +#endif +#include + #ifndef NO_CRYPT_BENCHMARK /*-specs=nano.specs doesn’t include support for floating point in printf()*/ asm (".global _printf_float"); #ifndef RTC_FREQ - #define RTC_FREQ 32768 +#define RTC_FREQ 32768UL #endif -#define CLINT_MTIME_ADDR 0x200bff8 +/* CLINT Registers (Core Local Interruptor) for time */ +#define CLINT_BASE 0x02000000UL +#define CLINT_REG_MTIME (*((volatile uint32_t *)(CLINT_BASE + 0xBFF8))) + #define 
WOLFSSL_SIFIVE_RISC_V_DEBUG 0 -unsigned long get_cpu_freq(void) -{ - /* If clocking up the CPU, you need to add a logic to measure cpu freq */ - - return RTC_FREQ; -} - double current_time(int reset) { - volatile uint64_t * mtime = (uint64_t*) (CLINT_MTIME_ADDR); - uint64_t now = *mtime; + double now = CLINT_REG_MTIME; (void)reset; - /**/ - return now/get_cpu_freq(); + return now/RTC_FREQ; } -#endif +#endif /* !NO_CRYPT_BENCHMARK */ #if WOLFSSL_SIFIVE_RISC_V_DEBUG void check(int depth) { @@ -63,28 +62,27 @@ void check(int depth) { char *ptr = malloc(1); printf("stack at %p, heap at %p\n", &ch, ptr); - if (depth <= 0) + if (depth <= 0) return; - + check(depth-1); free(ptr); } -void mtime_sleep( uint64_t ticks) { - volatile uint64_t * mtime = (uint64_t*) (CLINT_MTIME_ADDR); - uint64_t now = *mtime; +void mtime_sleep(uint64_t ticks) { + uint64_t now = CLINT_REG_MTIME; uint64_t then = now + ticks; while((*mtime - now) < ticks) { - + } } void delay(int sec) { - uint64_t ticks = sec * get_cpu_freq(); + uint64_t ticks = sec * RTC_FREQ; mtime_sleep(ticks); } -#endif +#endif /* WOLFSSL_SIFIVE_RISC_V_DEBUG */ /* RNG CODE */ /* TODO: Implement real RNG */ @@ -124,9 +122,16 @@ int my_rng_gen_block(unsigned char* output, unsigned int sz) return 0; } -int main(void) + +#if !defined(NO_CLOCK_SPEEDUP) && !defined(USE_CLOCK_HZ) + /* 320MHz */ + #define USE_CLOCK_HZ 320000000UL +#endif + +int main(void) { int ret; + long clkHz = 16000000; /* default */ #if WOLFSSL_SIFIVE_RISC_V_DEBUG printf("check stack and heap addresses\n"); @@ -134,11 +139,21 @@ int main(void) printf("sleep for 10 seconds to verify timer, measure using a stopwatch\n"); delay(10); printf("awake after sleeping for 10 seconds\n"); -#endif - - #ifdef DEBUG_WOLFSSL - wolfSSL_Debugging_ON(); - #endif +#endif + +#ifdef USE_CLOCK_HZ + /* Speed up clock */ + printf("SiFive HiFive1 Demo\n"); + printf("Setting clock to %dMHz\n", USE_CLOCK_HZ/1000000); + clkHz = metal_clock_set_rate_hz( + 
&__METAL_DT_SIFIVE_FE310_G000_PLL_HANDLE->clock, USE_CLOCK_HZ + ); +#endif + printf("Actual Clock %dMHz\n", clkHz/1000000); + +#ifdef DEBUG_WOLFSSL + wolfSSL_Debugging_ON(); +#endif if ((ret = wolfCrypt_Init()) != 0) { printf("wolfCrypt_Init failed %d\n", ret); @@ -156,10 +171,10 @@ int main(void) benchmark_test(NULL); printf("\nBenchmark Test Completed\n"); #endif + if ((ret = wolfCrypt_Cleanup()) != 0) { printf("wolfCrypt_Cleanup failed %d\n", ret); return -1; } return 0; } - From e01ae0980969491efb7a3178a3e917dea7445524 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 31 May 2019 10:12:46 +1000 Subject: [PATCH 06/21] X25519 when not AVX2 --- wolfcrypt/src/fe_x25519_asm.S | 108 +++++++++++++++++----------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index c4d2075eb..a0f57c5a2 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -2045,56 +2045,6 @@ L_curve25519_x64_bits: xorq %r11, 56(%rsp) movq %rbp, %rbx # Add - movq 64(%rsp), %rcx - movq 72(%rsp), %r9 - movq 80(%rsp), %r10 - movq 88(%rsp), %rbp - movq %rcx, %r12 - addq 32(%rsp), %rcx - movq %r9, %r13 - adcq 40(%rsp), %r9 - movq %r10, %r14 - adcq 48(%rsp), %r10 - movq %rbp, %r15 - adcq 56(%rsp), %rbp - movq $-19, %rax - movq %rbp, %r11 - movq $0x7fffffffffffffff, %rdx - sarq $63, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Sub modulus (if overflow) - subq %rax, %rcx - sbbq %rbp, %r9 - sbbq %rbp, %r10 - sbbq %rdx, %r11 - # Sub - subq 32(%rsp), %r12 - movq $0x00, %rbp - sbbq 40(%rsp), %r13 - movq $-19, %rax - sbbq 48(%rsp), %r14 - movq $0x7fffffffffffffff, %rdx - sbbq 56(%rsp), %r15 - sbbq $0x00, %rbp - # Mask the modulus - andq %rbp, %rax - andq %rbp, %rdx - # Add modulus (if underflow) - addq %rax, %r12 - adcq %rbp, %r13 - adcq %rbp, %r14 - adcq %rdx, %r15 - movq %rcx, (%rdi) - movq %r9, 8(%rdi) - movq %r10, 16(%rdi) - movq %r11, 24(%rdi) - movq %r12, 96(%rsp) - movq 
%r13, 104(%rsp) - movq %r14, 112(%rsp) - movq %r15, 120(%rsp) - # Add movq (%rdi), %rcx movq 8(%rdi), %r9 movq 16(%rdi), %r10 @@ -2136,14 +2086,64 @@ L_curve25519_x64_bits: adcq %rbp, %r13 adcq %rbp, %r14 adcq %rdx, %r15 - movq %rcx, (%rsp) - movq %r9, 8(%rsp) - movq %r10, 16(%rsp) - movq %r11, 24(%rsp) + movq %rcx, (%rdi) + movq %r9, 8(%rdi) + movq %r10, 16(%rdi) + movq %r11, 24(%rdi) movq %r12, 128(%rsp) movq %r13, 136(%rsp) movq %r14, 144(%rsp) movq %r15, 152(%rsp) + # Add + movq 64(%rsp), %rcx + movq 72(%rsp), %r9 + movq 80(%rsp), %r10 + movq 88(%rsp), %rbp + movq %rcx, %r12 + addq 32(%rsp), %rcx + movq %r9, %r13 + adcq 40(%rsp), %r9 + movq %r10, %r14 + adcq 48(%rsp), %r10 + movq %rbp, %r15 + adcq 56(%rsp), %rbp + movq $-19, %rax + movq %rbp, %r11 + movq $0x7fffffffffffffff, %rdx + sarq $63, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Sub modulus (if overflow) + subq %rax, %rcx + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rdx, %r11 + # Sub + subq 32(%rsp), %r12 + movq $0x00, %rbp + sbbq 40(%rsp), %r13 + movq $-19, %rax + sbbq 48(%rsp), %r14 + movq $0x7fffffffffffffff, %rdx + sbbq 56(%rsp), %r15 + sbbq $0x00, %rbp + # Mask the modulus + andq %rbp, %rax + andq %rbp, %rdx + # Add modulus (if underflow) + addq %rax, %r12 + adcq %rbp, %r13 + adcq %rbp, %r14 + adcq %rdx, %r15 + movq %rcx, (%rsp) + movq %r9, 8(%rsp) + movq %r10, 16(%rsp) + movq %r11, 24(%rsp) + movq %r12, 96(%rsp) + movq %r13, 104(%rsp) + movq %r14, 112(%rsp) + movq %r15, 120(%rsp) # Multiply # A[0] * B[0] movq (%rdi), %rax From 4f80c7c94e692f265568e9071e7f12a47f8a1db6 Mon Sep 17 00:00:00 2001 From: Tesfa Mael Date: Tue, 4 Jun 2019 13:12:27 -0700 Subject: [PATCH 07/21] Allow main to build when local debug flag is on --- IDE/ECLIPSE/SIFIVE/main.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/IDE/ECLIPSE/SIFIVE/main.c b/IDE/ECLIPSE/SIFIVE/main.c index 4b0e3e7a2..99369e33e 100644 --- a/IDE/ECLIPSE/SIFIVE/main.c +++ b/IDE/ECLIPSE/SIFIVE/main.c @@ -69,17 
+69,16 @@ void check(int depth) { free(ptr); } -void mtime_sleep(uint64_t ticks) { - uint64_t now = CLINT_REG_MTIME; - uint64_t then = now + ticks; +void mtime_sleep(uint32_t ticks) { + uint32_t start = CLINT_REG_MTIME; - while((*mtime - now) < ticks) { + while((CLINT_REG_MTIME - start) < ticks) { } } -void delay(int sec) { - uint64_t ticks = sec * RTC_FREQ; +void delay(uint32_t sec) { + uint32_t ticks = sec * RTC_FREQ; mtime_sleep(ticks); } #endif /* WOLFSSL_SIFIVE_RISC_V_DEBUG */ From 074e770c98796f1a8cea21338eb1758e1fc34ba1 Mon Sep 17 00:00:00 2001 From: David Garske Date: Thu, 30 May 2019 16:14:51 -0700 Subject: [PATCH 08/21] * Fix to ensure user_settings.h can be included for bio.c and evp.c when IDE or compiler tries to build it directly. Allows for wildcard .c include along with `WOLFSSL_IGNORE_FILE_WARN`. * Fix for building SP cortex M without RSA. * Fix for type-cast warning with STSAFE-A100. * Improved the RNG seed test to prevent type-case warning. --- src/bio.c | 2 ++ wolfcrypt/src/evp.c | 2 ++ wolfcrypt/src/port/st/stsafe.c | 4 ++-- wolfcrypt/src/sp_cortexm.c | 26 +++++++++++++------------- wolfcrypt/test/test.c | 5 +++-- 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/bio.c b/src/bio.c index d8349801b..c4b225759 100644 --- a/src/bio.c +++ b/src/bio.c @@ -19,6 +19,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +#include + #if !defined(WOLFSSL_BIO_INCLUDED) #ifndef WOLFSSL_IGNORE_FILE_WARN #warning bio.c does not need to be compiled separately from ssl.c diff --git a/wolfcrypt/src/evp.c b/wolfcrypt/src/evp.c index f92d4e8d4..3230259bf 100644 --- a/wolfcrypt/src/evp.c +++ b/wolfcrypt/src/evp.c @@ -19,6 +19,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ +#include + #if !defined(WOLFSSL_EVP_INCLUDED) #ifndef WOLFSSL_IGNORE_FILE_WARN #warning evp.c does not need to be compiled seperatly from ssl.c diff --git a/wolfcrypt/src/port/st/stsafe.c 
b/wolfcrypt/src/port/st/stsafe.c index 9b5e7503c..f96316a9f 100644 --- a/wolfcrypt/src/port/st/stsafe.c +++ b/wolfcrypt/src/port/st/stsafe.c @@ -39,7 +39,7 @@ int SSL_STSAFE_LoadDeviceCertificate(byte** pRawCertificate, /* Try reading device certificate from ST-SAFE Zone 0 */ err = stsafe_interface_read_device_certificate_raw( - pRawCertificate, pRawCertificateLen); + pRawCertificate, (uint32_t*)pRawCertificateLen); if (err == 0) { #if 0 /* example for loading into WOLFSSL_CTX */ @@ -154,7 +154,7 @@ int SSL_STSAFE_VerifyPeerCertCb(WOLFSSL* ssl, if (err == 0) { /* Verify signature */ err = stsafe_interface_verify(curve_id, (uint8_t*)hash, sigRS, - pubKeyX, pubKeyY, result); + pubKeyX, pubKeyY, (int32_t*)result); } wc_ecc_free(&key); diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c index 7e3e90dcc..6a5b9861a 100644 --- a/wolfcrypt/src/sp_cortexm.c +++ b/wolfcrypt/src/sp_cortexm.c @@ -39,10 +39,6 @@ defined(WOLFSSL_HAVE_SP_ECC) #ifdef RSA_LOW_MEM -#ifndef SP_RSA_PRIVATE_EXP_D -#define SP_RSA_PRIVATE_EXP_D -#endif - #ifndef WOLFSSL_SP_SMALL #define WOLFSSL_SP_SMALL #endif @@ -3670,7 +3666,7 @@ static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA || WOLFSSL_HAVE_SP_DH) && !WOLFSSL_RSA_PUBLIC_ONLY */ -#ifdef WOLFSSL_HAVE_SP_DH +#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 2048 bits, just need to subtract. * @@ -3685,7 +3681,8 @@ static void sp_2048_mont_norm_64(sp_digit* r, sp_digit* m) sp_2048_sub_in_place_64(r, m); } -#endif /* WOLFSSL_HAVE_SP_DH */ +#endif /* WOLFSSL_HAVE_SP_RSA || WOLFSSL_HAVE_SP_DH */ + /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. 
* @@ -4072,8 +4069,8 @@ static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, sp_digit* a, sp_digit* m) return sp_2048_div_64_cond(a, m, NULL, r); } -#if (defined(SP_RSA_PRIVATE_EXP_D) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ - defined(WOLFSSL_HAVE_SP_DH) +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ + defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL /* Modular exponentiate a to the e mod m. (r = a^e mod m) * @@ -4346,7 +4343,7 @@ static int sp_2048_mod_exp_64(sp_digit* r, sp_digit* a, sp_digit* e, return err; } #endif /* WOLFSSL_SP_SMALL */ -#endif /* (SP_RSA_PRIVATE_EXP_D && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. @@ -9134,7 +9131,8 @@ static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e, #endif /* (WOLFSSL_HAVE_SP_RSA || WOLFSSL_HAVE_SP_DH) && !WOLFSSL_RSA_PUBLIC_ONLY */ -#ifdef WOLFSSL_HAVE_SP_DH +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ + defined(WOLFSSL_HAVE_SP_DH) /* r = 2^n mod m where n is the number of bits to reduce by. * Given m must be 3072 bits, just need to subtract. * @@ -9149,7 +9147,9 @@ static void sp_3072_mont_norm_96(sp_digit* r, sp_digit* m) sp_3072_sub_in_place_96(r, m); } -#endif /* WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ + + /* Conditionally subtract b from a using the mask m. * m is -1 to subtract and 0 when not copying. * @@ -9542,7 +9542,7 @@ static WC_INLINE int sp_3072_mod_96_cond(sp_digit* r, sp_digit* a, sp_digit* m) return sp_3072_div_96_cond(a, m, NULL, r); } -#if (defined(SP_RSA_PRIVATE_EXP_D) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ +#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \ defined(WOLFSSL_HAVE_SP_DH) #ifdef WOLFSSL_SP_SMALL /* Modular exponentiate a to the e mod m. 
(r = a^e mod m) @@ -9816,7 +9816,7 @@ static int sp_3072_mod_exp_96(sp_digit* r, sp_digit* a, sp_digit* e, return err; } #endif /* WOLFSSL_SP_SMALL */ -#endif /* (SP_RSA_PRIVATE_EXP_D && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ +#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */ #ifdef WOLFSSL_HAVE_SP_RSA /* RSA public key operation. diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 9e29cf59e..591213f42 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -23702,7 +23702,8 @@ static int myCryptoDevCb(int devIdArg, wc_CryptoInfo* info, void* ctx) } else if (info->algo_type == WC_ALGO_TYPE_SEED) { #ifndef WC_NO_RNG - static byte seed[] = { 0x00, 0x00, 0x00, 0x01 }; + static byte seed[sizeof(word32)] = { 0x00, 0x00, 0x00, 0x01 }; + word32* seedWord32 = (word32*)seed; word32 len; /* wc_GenerateSeed is a local symbol so we need to fake the entropy. */ @@ -23713,7 +23714,7 @@ static int myCryptoDevCb(int devIdArg, wc_CryptoInfo* info, void* ctx) XMEMCPY(info->seed.seed, seed, sizeof(seed)); info->seed.seed += len; info->seed.sz -= len; - (*((word32*)seed))++; + (*seedWord32)++; } ret = 0; From 9f9155b6f2ed148adf9422627fb9537b302b0526 Mon Sep 17 00:00:00 2001 From: David Garske Date: Tue, 4 Jun 2019 15:55:15 -0700 Subject: [PATCH 09/21] Added STSAFE Crypto Callback support. 
--- wolfcrypt/src/port/st/stsafe.c | 187 +++++++++++++++++++++++++++++ wolfssl/wolfcrypt/port/st/stsafe.h | 28 ++++- 2 files changed, 212 insertions(+), 3 deletions(-) diff --git a/wolfcrypt/src/port/st/stsafe.c b/wolfcrypt/src/port/st/stsafe.c index f96316a9f..93ad35d4d 100644 --- a/wolfcrypt/src/port/st/stsafe.c +++ b/wolfcrypt/src/port/st/stsafe.c @@ -325,4 +325,191 @@ int SSL_STSAFE_SetupPkCallbackCtx(WOLFSSL* ssl, void* user_ctx) #endif /* HAVE_PK_CALLBACKS */ +#ifdef WOLF_CRYPTO_CB + +int wolfSSL_STSAFE_CryptoDevCb(int devId, wc_CryptoInfo* info, void* ctx) +{ + int rc = CRYPTOCB_UNAVAILABLE; + wolfSTSAFE_CryptoCb_Ctx* stsCtx = (wolfSTSAFE_CryptoCb_Ctx*)ctx; + + if (info == NULL || ctx == NULL) + return BAD_FUNC_ARG; + + (void)devId; + (void)stsCtx; + + if (info->algo_type == WC_ALGO_TYPE_SEED) { + /* use the STSAFE hardware for RNG seed */ + #if !defined(WC_NO_RNG) && defined(USE_STSAFE_RNG_SEED) + while (info->seed.sz > 0) { + rc = stsafe_interface_getrandom(info->seed.seed, info->seed.sz); + if (rc < 0) { + return rc; + } + info->seed.seed += rc; + info->seed.sz -= rc; + } + rc = 0; + #else + rc = CRYPTOCB_UNAVAILABLE; + #endif + } +#ifdef HAVE_ECC + else if (info->algo_type == WC_ALGO_TYPE_PK) { + #ifdef USE_STSAFE_VERBOSE + printf("STSAFE Pk: Type %d\n", info->pk.type); + #endif + + if (info->pk.type == WC_PK_TYPE_EC_KEYGEN) { + byte pubKeyRaw[STSAFE_MAX_PUBKEY_RAW_LEN]; + StSafeA_KeySlotNumber slot; + StSafeA_CurveId curve_id; + int ecc_curve, key_sz; + + WOLFSSL_MSG("STSAFE: ECC KeyGen"); + + /* get curve */ + ecc_curve = info->pk.eckg.curveId; + curve_id = stsafe_get_ecc_curve_id(ecc_curve); + key_sz = stsafe_get_key_size(curve_id); + + /* generate new ephemeral key on device */ + rc = stsafe_interface_create_key(&slot, curve_id, + (uint8_t*)pubKeyRaw); + if (rc != 0) { + return rc; + } + + /* load generated public key into key, used by wolfSSL */ + rc = wc_ecc_import_unsigned(info->pk.eckg.key, pubKeyRaw, + &pubKeyRaw[key_sz], NULL, ecc_curve); + } + 
else if (info->pk.type == WC_PK_TYPE_ECDSA_SIGN) { + byte digest[STSAFE_MAX_KEY_LEN]; + byte sigRS[STSAFE_MAX_SIG_LEN]; + byte *r, *s; + StSafeA_CurveId curve_id; + word32 inSz = info->pk.eccsign.inlen; + int key_sz; + + WOLFSSL_MSG("STSAFE: ECC Sign"); + + curve_id = stsafe_get_curve_mode(); + key_sz = stsafe_get_key_size(curve_id); + + /* truncate input to match key size */ + if (inSz > key_sz) + inSz = key_sz; + + /* Build input digest */ + XMEMSET(&digest[0], 0, sizeof(digest)); + XMEMCPY(&digest[key_sz - inSz], info->pk.eccsign.in, inSz); + + /* Sign using slot 0: Result is R then S */ + /* Sign will always use the curve type in slot 0 + (the TLS curve needs to match) */ + XMEMSET(sigRS, 0, sizeof(sigRS)); + rc = stsafe_interface_sign(STSAFE_A_SLOT_0, curve_id, + (uint8_t*)info->pk.eccsign.in, sigRS); + if (rc != 0) { + return rc; + } + + /* Convert R and S to signature */ + r = &sigRS[0]; + s = &sigRS[key_sz]; + rc = wc_ecc_rs_raw_to_sig((const byte*)r, key_sz, (const byte*)s, + key_sz, info->pk.eccsign.out, info->pk.eccsign.outlen); + if (rc != 0) { + WOLFSSL_MSG("Error converting RS to Signature"); + } + } + else if (info->pk.type == WC_PK_TYPE_ECDSA_VERIFY) { + byte sigRS[STSAFE_MAX_SIG_LEN]; + byte *r, *s; + word32 r_len = STSAFE_MAX_SIG_LEN/2, s_len = STSAFE_MAX_SIG_LEN/2; + byte pubKeyX[STSAFE_MAX_PUBKEY_RAW_LEN/2]; + byte pubKeyY[STSAFE_MAX_PUBKEY_RAW_LEN/2]; + word32 pubKeyX_len = sizeof(pubKeyX); + word32 pubKeyY_len = sizeof(pubKeyY); + StSafeA_CurveId curve_id; + int ecc_curve, key_sz; + + WOLFSSL_MSG("STSAFE: ECC Verify"); + + if (info->pk.eccverify.key == NULL) + return BAD_FUNC_ARG; + + /* determine curve */ + ecc_curve = info->pk.eccverify.key->dp->id; + curve_id = stsafe_get_ecc_curve_id(ecc_curve); + key_sz = stsafe_get_key_size(curve_id); + + /* Extract Raw X and Y coordinates of the public key */ + rc = wc_ecc_export_public_raw(info->pk.eccverify.key, + pubKeyX, &pubKeyX_len, + pubKeyY, &pubKeyY_len); + if (rc == 0) { + /* Extract R and S 
from signature */ + XMEMSET(sigRS, 0, sizeof(sigRS)); + r = &sigRS[0]; + s = &sigRS[key_sz]; + rc = wc_ecc_sig_to_rs(info->pk.eccverify.sig, + info->pk.eccverify.siglen, r, &r_len, s, &s_len); + (void)r_len; + (void)s_len; + } + if (rc == 0) { + /* Verify signature */ + rc = stsafe_interface_verify(curve_id, + (uint8_t*)info->pk.eccverify.hash, sigRS, pubKeyX, pubKeyY, + (int32_t*)info->pk.eccverify.res); + } + } + else if (info->pk.type == WC_PK_TYPE_ECDH) { + byte otherKeyX[STSAFE_MAX_KEY_LEN]; + byte otherKeyY[STSAFE_MAX_KEY_LEN]; + word32 otherKeyX_len = sizeof(otherKeyX); + word32 otherKeyY_len = sizeof(otherKeyY); + StSafeA_CurveId curve_id; + int ecc_curve; + + WOLFSSL_MSG("STSAFE: PMS"); + + if (info->pk.ecdh.public_key == NULL) + return BAD_FUNC_ARG; + + /* get curve */ + ecc_curve = info->pk.ecdh.public_key->dp->id; + curve_id = stsafe_get_ecc_curve_id(ecc_curve); + + /* Export otherKey raw X and Y */ + rc = wc_ecc_export_public_raw(info->pk.ecdh.public_key, + &otherKeyX[0], (word32*)&otherKeyX_len, + &otherKeyY[0], (word32*)&otherKeyY_len); + if (rc == 0) { + /* Compute shared secret */ + *info->pk.ecdh.outlen = 0; + rc = stsafe_interface_shared_secret(curve_id, + otherKeyX, otherKeyY, + info->pk.ecdh.out, (int32_t*)info->pk.ecdh.outlen); + } + } + } +#endif /* HAVE_ECC */ + + /* need to return negative here for error */ + if (rc != 0 && rc != CRYPTOCB_UNAVAILABLE) { + WOLFSSL_MSG("STSAFE: CryptoCb failed"); + #ifdef USE_STSAFE_VERBOSE + printf("STSAFE: CryptoCb failed %d\n", rc); + #endif + rc = WC_HW_E; + } + + return rc; +} + +#endif /* WOLF_CRYPTO_CB */ + #endif /* WOLFSSL_STSAFEA100 */ diff --git a/wolfssl/wolfcrypt/port/st/stsafe.h b/wolfssl/wolfcrypt/port/st/stsafe.h index 4a60470db..e7c451d90 100644 --- a/wolfssl/wolfcrypt/port/st/stsafe.h +++ b/wolfssl/wolfcrypt/port/st/stsafe.h @@ -29,6 +29,8 @@ #ifdef WOLFSSL_STSAFEA100 +/* The wolf STSAFE interface layer */ +/* Please contact wolfSSL for the STSAFE port files */ #include "stsafe_interface.h" 
#ifndef STSAFE_MAX_KEY_LEN @@ -52,11 +54,11 @@ WOLFSSL_API int SSL_STSAFE_VerifyPeerCertCb(WOLFSSL* ssl, const unsigned char* hash, unsigned int hashSz, const unsigned char* keyDer, unsigned int keySz, int* result, void* ctx); -WOLFSSL_API int SSL_STSAFE_SignCertificateCb(WOLFSSL* ssl, +WOLFSSL_API int SSL_STSAFE_SignCertificateCb(WOLFSSL* ssl, const byte* in, word32 inSz, - byte* out, word32* outSz, + byte* out, word32* outSz, const byte* key, word32 keySz, void* ctx); -WOLFSSL_API int SSL_STSAFE_SharedSecretCb(WOLFSSL* ssl, +WOLFSSL_API int SSL_STSAFE_SharedSecretCb(WOLFSSL* ssl, ecc_key* otherKey, unsigned char* pubKeyDer, unsigned int* pubKeySz, unsigned char* out, unsigned int* outlen, @@ -65,7 +67,27 @@ WOLFSSL_API int SSL_STSAFE_SharedSecretCb(WOLFSSL* ssl, /* Helper API's for setting up callbacks */ WOLFSSL_API int SSL_STSAFE_SetupPkCallbacks(WOLFSSL_CTX* ctx); WOLFSSL_API int SSL_STSAFE_SetupPkCallbackCtx(WOLFSSL* ssl, void* user_ctx); +#endif /* HAVE_PK_CALLBACKS */ + + +#ifdef WOLF_CRYPTO_CB + +#include + +/* Device ID that's unique and valid (not INVALID_DEVID -2) */ +#define WOLF_STSAFE_DEVID 0x53545341; /* STSA */ + +typedef struct wolfSTSAFE_CryptoCb_Ctx { +#ifdef HAVE_ECC + ecc_key wolfEccKey; #endif + int devId; +} wolfSTSAFE_CryptoCb_Ctx; + +WOLFSSL_API int wolfSSL_STSAFE_CryptoDevCb(int devId, wc_CryptoInfo* info, + void* ctx); + +#endif /* WOLF_CRYPTO_CB */ #endif /* WOLFSSL_STSAFEA100 */ From fdd01c8c8c4828c1d26c031c5cbb1a5098460f81 Mon Sep 17 00:00:00 2001 From: David Garske Date: Wed, 5 Jun 2019 20:21:48 -0700 Subject: [PATCH 10/21] Added support for SHA512, ED25519/CURVE25519 and ChaCha20/Poly1305. Enabled ECC Single Precision (SP) support. Updated README.md wolfCrypt test/benchmarks. Pulled in fix "Fix casting of memory allocation to correct type" from Sean. 
--- IDE/ECLIPSE/SIFIVE/Makefile | 3 +- IDE/ECLIPSE/SIFIVE/README.md | 76 ++++++++++++++++++++---------- IDE/ECLIPSE/SIFIVE/user_settings.h | 63 +++++++++++++------------ wolfcrypt/src/sp_c32.c | 2 +- wolfcrypt/src/sp_c64.c | 2 +- 5 files changed, 88 insertions(+), 58 deletions(-) diff --git a/IDE/ECLIPSE/SIFIVE/Makefile b/IDE/ECLIPSE/SIFIVE/Makefile index 76f372743..b19c86f6a 100644 --- a/IDE/ECLIPSE/SIFIVE/Makefile +++ b/IDE/ECLIPSE/SIFIVE/Makefile @@ -26,7 +26,8 @@ OPT_CFLAGS = -specs=nano.specs # The __stack_size and __heap_size symbols are defined in the linker metal.default.ld # script in the freedom-e-sdk. override CFLAGS += $(OPT_CFLAGS) $(WOLFSSL_CFLAGS) \ - -Xlinker --defsym=__stack_size=0x1000 + -Xlinker --defsym=__stack_size=0x1200 \ + -Xlinker --defsym=__heap_size=0x800 $(PROGRAM): $(SRC) $(CC) $(CFLAGS) $(SRC) $(LDFLAGS) $(LDLIBS) -o $@ diff --git a/IDE/ECLIPSE/SIFIVE/README.md b/IDE/ECLIPSE/SIFIVE/README.md index f590397ca..385ddceee 100644 --- a/IDE/ECLIPSE/SIFIVE/README.md +++ b/IDE/ECLIPSE/SIFIVE/README.md @@ -103,10 +103,15 @@ base64 test passed! asn test passed! SHA test passed! SHA-256 test passed! +SHA-512 test passed! Hash test passed! HMAC-SHA test passed! HMAC-SHA256 test passed! +HMAC-SHA512 test passed! GMAC test passed! +Chacha test passed! +POLY1305 test passed! +ChaCha20-Poly1305 AEAD test passed! AES test passed! AES192 test passed! AES256 test passed! @@ -114,6 +119,8 @@ AES-GCM test passed! RANDOM test passed! ECC test passed! ECC buffer test passed! +CURVE25519 test passed! +ED25519 test passed! logging test passed! mutex test passed! 
Test complete @@ -125,45 +132,62 @@ benchmark_test() prints a message on the target console similar to the following TARGET=sifive-hifive1-revb: ``` +SiFive HiFive1 Demo +Setting clock to 320MHz +Actual Clock 320MHz + ------------------------------------------------------------------------------ wolfSSL version 4.0.0 ------------------------------------------------------------------------------ wolfCrypt Benchmark (block bytes 1024, min 1.0 sec each) -RNG 12 MB took 1.000 seconds, 11.666 MB/s -AES-128-CBC-enc 50 KB took 1.659 seconds, 30.131 KB/s -AES-128-CBC-dec 50 KB took 1.657 seconds, 30.183 KB/s -AES-192-CBC-enc 50 KB took 1.839 seconds, 27.189 KB/s -AES-192-CBC-dec 50 KB took 1.836 seconds, 27.230 KB/s -AES-256-CBC-enc 25 KB took 1.010 seconds, 24.759 KB/s -AES-256-CBC-dec 25 KB took 1.008 seconds, 24.791 KB/s -AES-128-GCM-enc 25 KB took 1.508 seconds, 16.576 KB/s -AES-128-GCM-dec 25 KB took 1.510 seconds, 16.559 KB/s -AES-192-GCM-enc 25 KB took 1.605 seconds, 15.573 KB/s -AES-192-GCM-dec 25 KB took 1.607 seconds, 15.558 KB/s -AES-256-GCM-enc 25 KB took 1.699 seconds, 14.716 KB/s -AES-256-GCM-dec 25 KB took 1.700 seconds, 14.702 KB/s -SHA 2 MB took 1.014 seconds, 1.589 MB/s -SHA-256 425 KB took 1.009 seconds, 421.068 KB/s -HMAC-SHA 1 MB took 1.013 seconds, 1.325 MB/s -HMAC-SHA256 425 KB took 1.018 seconds, 417.420 KB/s -ECC 256 key gen 2 ops took 1.393 sec, avg 696.503 ms, 1.436 ops/sec -ECDHE 256 agree 2 ops took 1.386 sec, avg 692.917 ms, 1.443 ops/sec -ECDSA 256 sign 2 ops took 1.406 sec, avg 703.064 ms, 1.422 ops/sec -ECDSA 256 verify 2 ops took 2.773 sec, avg 1386.597 ms, 0.721 ops/sec +RNG 200 KB took 1.044 seconds, 191.519 KB/s +AES-128-CBC-enc 50 KB took 1.657 seconds, 30.174 KB/s +AES-128-CBC-dec 50 KB took 1.659 seconds, 30.141 KB/s +AES-192-CBC-enc 50 KB took 1.837 seconds, 27.220 KB/s +AES-192-CBC-dec 50 KB took 1.839 seconds, 27.194 KB/s +AES-256-CBC-enc 25 KB took 1.009 seconds, 24.784 KB/s +AES-256-CBC-dec 25 KB took 1.010 seconds, 24.761 KB/s 
+AES-128-GCM-enc 25 KB took 1.493 seconds, 16.739 KB/s +AES-128-GCM-dec 25 KB took 1.564 seconds, 15.986 KB/s +AES-192-GCM-enc 25 KB took 1.591 seconds, 15.716 KB/s +AES-192-GCM-dec 25 KB took 1.662 seconds, 15.044 KB/s +AES-256-GCM-enc 25 KB took 1.684 seconds, 14.843 KB/s +AES-256-GCM-dec 25 KB took 1.755 seconds, 14.245 KB/s +CHACHA 1 MB took 1.004 seconds, 0.997 MB/s +CHA-POLY 675 KB took 1.021 seconds, 661.060 KB/s +POLY1305 2 MB took 1.007 seconds, 2.230 MB/s +SHA 1 MB took 1.016 seconds, 1.321 MB/s +SHA-256 425 KB took 1.005 seconds, 422.909 KB/s +SHA-512 25 KB took 2.043 seconds, 12.239 KB/s +HMAC-SHA 1 MB took 1.010 seconds, 1.378 MB/s +HMAC-SHA256 425 KB took 1.037 seconds, 409.781 KB/s +HMAC-SHA512 25 KB took 2.075 seconds, 12.050 KB/s +ECC 256 key gen 2 ops took 1.099 sec, avg 549.271 ms, 1.821 ops/sec +ECDHE 256 agree 2 ops took 1.093 sec, avg 546.555 ms, 1.830 ops/sec +ECDSA 256 sign 2 ops took 1.167 sec, avg 583.694 ms, 1.713 ops/sec +ECDSA 256 verify 2 ops took 2.136 sec, avg 1067.795 ms, 0.937 ops/sec +CURVE 25519 key gen 2 ops took 1.693 sec, avg 846.451 ms, 1.181 ops/sec +CURVE 25519 agree 2 ops took 1.689 sec, avg 844.299 ms, 1.184 ops/sec +ED 25519 key gen 1 ops took 1.702 sec, avg 1702.057 ms, 0.588 ops/sec +ED 25519 sign 2 ops took 3.650 sec, avg 1824.753 ms, 0.548 ops/sec +ED 25519 verify 2 ops took 5.788 sec, avg 2894.012 ms, 0.346 ops/sec Benchmark complete ``` ## Tested Configurations -- SHA-1 -- SHA-256 -- AES CBC/GCM -- ECC 256 sign/verify/shared secret with fast math library +- P-RNG (NIST DRBG) with SHA-256 +- SHA 1/256/512 +- AES 128/192/256 CBC/GCM +- ECC 256 sign/verify/shared secret with fast math or Single Precision (SP) library +- ED25519/Curve25519 +- HMAC +- ChaCha20/Poly1305 ## Known Caveats - If you find the wolfcrypt test stuck on early_trap_vector error, it is like related to memory issues - Using the `__stack_size` default value of 0x400 will not be enough for the ECC test to pass. 
The `IDE/ECLIPSE/SIFIVE/Makefile` overwrites the value with 0x1000 (4 KBytes) -- Enabling RSA will cause the ECC test to fail due to memory shortage +- Enabling RSA will cause the ECC test to fail due to memory shortage. ## References diff --git a/IDE/ECLIPSE/SIFIVE/user_settings.h b/IDE/ECLIPSE/SIFIVE/user_settings.h index c48ee633f..249783c40 100644 --- a/IDE/ECLIPSE/SIFIVE/user_settings.h +++ b/IDE/ECLIPSE/SIFIVE/user_settings.h @@ -70,10 +70,25 @@ extern "C" { //#define TFM_ARM #endif +/* Wolf Single Precision Math */ +/* Optional ECC SECP256R1 acceleration using optimized C code */ +#undef WOLFSSL_SP +#if 1 + #define WOLFSSL_SP + #define WOLFSSL_SP_SMALL /* use smaller version of code (requires heap) */ + #define SP_WORD_SIZE 32 /* force 32-bit type */ + #define WOLFSSL_SP_MATH /* only SP math - eliminates fast math code */ + //#define WOLFSSL_SP_DIV_32 /* do not use 64-bit divides */ + + #define WOLFSSL_HAVE_SP_ECC + //#define WOLFSSL_HAVE_SP_RSA +#endif + /* ------------------------------------------------------------------------- */ /* Crypto */ /* ------------------------------------------------------------------------- */ /* RSA */ +/* Not enabled due to memory constraints on HiFive1 */ #undef NO_RSA #if 0 #ifdef USE_FAST_MATH @@ -166,17 +181,9 @@ extern "C" { #define FP_MAX_BITS (256 + 32) #else #undef ALT_ECC_SIZE - #define ALT_ECC_SIZE - #endif - - /* Speedups specific to curve */ - #ifndef NO_ECC256 - #undef TFM_ECC256 - //#define TFM_ECC256 - #endif - #ifndef HAVE_ECC384 - #undef TFM_ECC384 - //#define TFM_ECC384 + /* Disable alternate ECC size, since it uses HEAP allocations. 
+ Heap is limited resource on HiFive1 */ + //#define ALT_ECC_SIZE #endif #endif #endif @@ -203,7 +210,7 @@ extern "C" { #if 1 #undef HAVE_AES_CBC #define HAVE_AES_CBC - + #undef HAVE_AESGCM #define HAVE_AESGCM @@ -234,7 +241,7 @@ extern "C" { /* ChaCha20 / Poly1305 */ #undef HAVE_CHACHA #undef HAVE_POLY1305 -#if 0 +#if 1 #define HAVE_CHACHA #define HAVE_POLY1305 @@ -246,12 +253,13 @@ extern "C" { /* Ed25519 / Curve25519 */ #undef HAVE_CURVE25519 #undef HAVE_ED25519 -#if 0 +#if 1 #define HAVE_CURVE25519 #define HAVE_ED25519 /* ED25519 Requires SHA512 */ /* Optionally use small math (less flash usage, but much slower) */ #if 1 + /* Curve and Ed 25519 small */ #define CURVED25519_SMALL #endif #endif @@ -285,7 +293,7 @@ extern "C" { /* Sha512 */ #undef WOLFSSL_SHA512 -#if 0 +#if 1 #define WOLFSSL_SHA512 /* Sha384 */ @@ -333,7 +341,7 @@ extern "C" { #define BENCH_EMBEDDED #undef USE_CERT_BUFFERS_2048 -//#define USE_CERT_BUFFERS_2048 +#define USE_CERT_BUFFERS_2048 #undef USE_CERT_BUFFERS_1024 //#define USE_CERT_BUFFERS_1024 @@ -420,23 +428,20 @@ extern "C" { #define WOLFSSL_USER_CURRTIME #define WOLFSSL_GMTIME #define USER_TICKS -#endif - -#if !defined(WOLFSSL_SIFIVE_RISC_V) -// extern unsigned long my_time(unsigned long* timer); -// #define XTIME my_time +#else + // extern unsigned long my_time(unsigned long* timer); + // #define XTIME my_time #endif /* ------------------------------------------------------------------------- */ /* RNG */ /* ------------------------------------------------------------------------- */ - -#if 1 -/* Bypass P-RNG and use only HW RNG */ -#define CUSTOM_RAND_TYPE unsigned int -extern int my_rng_gen_block(unsigned char* output, unsigned int sz); -#undef CUSTOM_RAND_GENERATE_BLOCK -#define CUSTOM_RAND_GENERATE_BLOCK my_rng_gen_block +#if 0 + /* Bypass P-RNG and use only HW RNG */ + #define CUSTOM_RAND_TYPE unsigned int + extern int my_rng_gen_block(unsigned char* output, unsigned int sz); + #undef CUSTOM_RAND_GENERATE_BLOCK + #define 
CUSTOM_RAND_GENERATE_BLOCK my_rng_gen_block #else #define HAVE_HASHDRBG @@ -477,7 +482,7 @@ extern int my_rng_gen_block(unsigned char* output, unsigned int sz); #define HAVE_SUPPORTED_CURVES #undef WOLFSSL_BASE64_ENCODE -#define WOLFSSL_BASE64_ENCODE +//#define WOLFSSL_BASE64_ENCODE /* TLS Session Cache */ #if 0 diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c index a9a7c8bb3..ea9efca1a 100644 --- a/wolfcrypt/src/sp_c32.c +++ b/wolfcrypt/src/sp_c32.c @@ -7369,7 +7369,7 @@ static int sp_256_mod_mul_norm_10(sp_digit* r, sp_digit* a, sp_digit* m) (void)m; #if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK) - td = (sp_digit*)XMALLOC(sizeof(int64_t) * 2 * 8, NULL, DYNAMIC_TYPE_ECC); + td = (int64_t*)XMALLOC(sizeof(int64_t) * 2 * 8, NULL, DYNAMIC_TYPE_ECC); if (td != NULL) { t = td; a32 = td + 8; diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c index 046d432d2..417825de2 100644 --- a/wolfcrypt/src/sp_c64.c +++ b/wolfcrypt/src/sp_c64.c @@ -7212,7 +7212,7 @@ static int sp_256_mod_mul_norm_5(sp_digit* r, sp_digit* a, sp_digit* m) (void)m; #if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK) - td = (sp_digit*)XMALLOC(sizeof(int64_t) * 2 * 8, NULL, DYNAMIC_TYPE_ECC); + td = (int64_t*)XMALLOC(sizeof(int64_t) * 2 * 8, NULL, DYNAMIC_TYPE_ECC); if (td != NULL) { t = td; a32 = td + 8; From 090899e74ed2ad5124f2b89e0f217674293e66a9 Mon Sep 17 00:00:00 2001 From: Jacob Barthelmeh Date: Thu, 6 Jun 2019 14:43:07 +0700 Subject: [PATCH 11/21] remove store of last block w/o padding and adjust padding last block case --- wolfcrypt/src/evp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wolfcrypt/src/evp.c b/wolfcrypt/src/evp.c index f92d4e8d4..d0164d588 100644 --- a/wolfcrypt/src/evp.c +++ b/wolfcrypt/src/evp.c @@ -358,15 +358,15 @@ WOLFSSL_API int wolfSSL_EVP_CipherUpdate(WOLFSSL_EVP_CIPHER_CTX *ctx, if ((ctx->flags & WOLFSSL_EVP_CIPH_NO_PADDING) || (ctx->block_size == 1)) { ctx->lastUsed = 0; - XMEMCPY(ctx->lastBlock, 
&out[ctx->block_size * blocks], ctx->block_size); *outl+= ctx->block_size * blocks; } else { if (inl == 0) { ctx->lastUsed = 1; blocks = blocks - 1; /* save last block to check padding in * EVP_CipherFinal call */ + XMEMCPY(ctx->lastBlock, &out[ctx->block_size * blocks], + ctx->block_size); } - XMEMCPY(ctx->lastBlock, &out[ctx->block_size * blocks], ctx->block_size); *outl+= ctx->block_size * blocks; } } else { From 87d24f44883117afa404734434d7be53cf60945c Mon Sep 17 00:00:00 2001 From: Jacob Barthelmeh Date: Thu, 6 Jun 2019 16:14:00 +0700 Subject: [PATCH 12/21] add additional test case for evp --- wolfcrypt/src/evp.c | 5 +- wolfcrypt/test/test.c | 110 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 2 deletions(-) diff --git a/wolfcrypt/src/evp.c b/wolfcrypt/src/evp.c index d0164d588..c6d0f643c 100644 --- a/wolfcrypt/src/evp.c +++ b/wolfcrypt/src/evp.c @@ -446,7 +446,10 @@ WOLFSSL_API int wolfSSL_EVP_CipherFinal(WOLFSSL_EVP_CIPHER_CTX *ctx, if ((fl = checkPad(ctx, ctx->lastBlock)) >= 0) { XMEMCPY(out, ctx->lastBlock, fl); *outl = fl; - } else return 0; + } + else { + return WOLFSSL_FAILURE; + } } /* return error in cases where the block length is incorrect */ if (ctx->lastUsed == 0 && ctx->bufUsed == 0) { diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 9e29cf59e..7a56326e8 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -13094,6 +13094,113 @@ static int openssl_aes_test(void) return -7334; } + /* set buffers to be exact size to catch potential over read/write */ + { + /* EVP_CipherUpdate test */ + const byte cbcPlain[] = + { + 0x6b,0xc1,0xbe,0xe2,0x2e,0x40,0x9f,0x96, + 0xe9,0x3d,0x7e,0x11,0x73,0x93,0x17,0x2a, + 0xae,0x2d,0x8a,0x57,0x1e,0x03,0xac,0x9c, + 0x9e,0xb7,0x6f,0xac,0x45,0xaf,0x8e,0x51, + 0x30,0xc8,0x1c,0x46,0xa3,0x5c,0xe4,0x11, + 0xe5,0xfb,0xc1,0x19,0x1a,0x0a,0x52,0xef, + 0xf6,0x9f,0x24,0x45,0xdf,0x4f,0x9b,0x17, + 0xad,0x2b,0x41,0x7b,0xe6,0x6c,0x37,0x10 + }; + + byte key[] = "0123456789abcdef 
"; /* align */ + byte iv[] = "1234567890abcdef "; /* align */ + + #define EVP_TEST_BUF_SZ 18 + #define EVP_TEST_BUF_PAD 32 + byte cipher[EVP_TEST_BUF_SZ]; + byte plain [EVP_TEST_BUF_SZ]; + byte padded[EVP_TEST_BUF_PAD]; + EVP_CIPHER_CTX en; + EVP_CIPHER_CTX de; + int outlen ; + int total = 0; + + EVP_CIPHER_CTX_init(&en); + if (EVP_CipherInit(&en, EVP_aes_128_cbc(), + (unsigned char*)key, (unsigned char*)iv, 1) == 0) + return -7370; + if (EVP_CIPHER_CTX_set_padding(&en, 0) != 1) + return -7372; + if (EVP_CipherUpdate(&en, (byte*)cipher, &outlen, + (byte*)cbcPlain, EVP_TEST_BUF_SZ) == 0) + return -7372; + if (outlen != 16) + return -7373; + total += outlen; + + /* should fail here */ + if (EVP_CipherFinal(&en, (byte*)&cipher[total], &outlen) != 0) + return -7374; + + /* turn padding back on and do successful encrypt */ + total = 0; + EVP_CIPHER_CTX_init(&en); + if (EVP_CipherInit(&en, EVP_aes_128_cbc(), + (unsigned char*)key, (unsigned char*)iv, 1) == 0) + return -7375; + if (EVP_CIPHER_CTX_set_padding(&en, 1) != 1) + return -7376; + if (EVP_CipherUpdate(&en, (byte*)padded, &outlen, + (byte*)cbcPlain, EVP_TEST_BUF_SZ) == 0) + return -7377; + if (outlen != 16) + return -7378; + total += outlen; + + if (EVP_CipherFinal(&en, (byte*)&padded[total], &outlen) == 0) + return -7379; + total += outlen; + if (total != 32) + return -7380; + XMEMCPY(cipher, padded, EVP_TEST_BUF_SZ); + + /* test out of bounds read on buffers w/o padding during decryption */ + total = 0; + EVP_CIPHER_CTX_init(&de); + if (EVP_CipherInit(&de, EVP_aes_128_cbc(), + (unsigned char*)key, (unsigned char*)iv, 0) == 0) + return -7381; + + if (EVP_CIPHER_CTX_set_padding(&de, 0) != 1) + return -7382; + if (EVP_CipherUpdate(&de, (byte*)plain, &outlen, (byte*)cipher, + EVP_TEST_BUF_SZ) == 0) + return -7383; + if (outlen != 16) + return -7384; + total += outlen; + + /* should fail since not using padding */ + if (EVP_CipherFinal(&de, (byte*)&plain[total], &outlen) != 0) + return -7385; + + total = 0; + 
EVP_CIPHER_CTX_init(&de); + if (EVP_CipherInit(&de, EVP_aes_128_cbc(), + (unsigned char*)key, (unsigned char*)iv, 0) == 0) + return -7386; + if (EVP_CIPHER_CTX_set_padding(&de, 1) != 1) + return -7387; + if (EVP_CipherUpdate(&de, (byte*)padded, &outlen, (byte*)padded, + EVP_TEST_BUF_PAD) == 0) + return -7388; + if (outlen != 16) + return -7389; + total += outlen; + + if (EVP_CipherFinal(&de, (byte*)&padded[total], &outlen) == 0) + return -7390; + if (XMEMCMP(padded, cbcPlain, EVP_TEST_BUF_SZ)) + return -7391; + } + { /* evp_cipher test: EVP_aes_128_cbc */ EVP_CIPHER_CTX ctx; @@ -13774,8 +13881,9 @@ int openssl_test(void) #endif /* NO_DES3 */ #if !defined(NO_AES) && !defined(WOLFCRYPT_ONLY) - if (openssl_aes_test() != 0) + if (openssl_aes_test() != 0) { return -7412; + } #if defined(WOLFSSL_AES_128) && defined(HAVE_AES_CBC) { /* evp_cipher test: EVP_aes_128_cbc */ From 292aa196fb5dd91793b5d16707a7de8625bde292 Mon Sep 17 00:00:00 2001 From: Hideki Miyazaki Date: Fri, 7 Jun 2019 12:05:19 +0900 Subject: [PATCH 13/21] minor fix to print stats in tls_bench --- examples/benchmark/tls_bench.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/benchmark/tls_bench.c b/examples/benchmark/tls_bench.c index c52935ad7..5be7f63db 100644 --- a/examples/benchmark/tls_bench.c +++ b/examples/benchmark/tls_bench.c @@ -1195,10 +1195,10 @@ static void print_stats(stats_t* wcStat, const char* desc, const char* cipher, i cipher, wcStat->txTotal + wcStat->rxTotal, wcStat->connCount, - wcStat->txTime * 1000, wcStat->rxTime * 1000, - wcStat->txTotal / wcStat->txTime / 1024 / 1024, + wcStat->txTime * 1000, wcStat->rxTotal / wcStat->rxTime / 1024 / 1024, + wcStat->txTotal / wcStat->txTime / 1024 / 1024, wcStat->connTime * 1000, wcStat->connTime * 1000 / wcStat->connCount); } From efcf06ef7c10b493674d1c2c76e46a668ba6b4b7 Mon Sep 17 00:00:00 2001 From: Hideki Miyazaki Date: Fri, 7 Jun 2019 19:13:46 +0900 Subject: [PATCH 14/21] use tlsv1_2 client method when tls13 is 
enabled --- examples/benchmark/tls_bench.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/benchmark/tls_bench.c b/examples/benchmark/tls_bench.c index 5be7f63db..cd3f8fedd 100644 --- a/examples/benchmark/tls_bench.c +++ b/examples/benchmark/tls_bench.c @@ -611,7 +611,12 @@ static int bench_tls_client(info_t* info) cli_ctx = wolfSSL_CTX_new(wolfTLSv1_3_client_method()); #endif if (!tls13) +#if !defined(WOLFSSL_TLS13) cli_ctx = wolfSSL_CTX_new(wolfSSLv23_client_method()); +#elif !defined(WOLFSSL_NO_TLS12) + cli_ctx = wolfSSL_CTX_new(wolfTLSv1_2_client_method()); +#endif + if (cli_ctx == NULL) { printf("error creating ctx\n"); ret = MEMORY_E; goto exit; From d7ab3a6dece5285acc85ec6d4600154c075c5aee Mon Sep 17 00:00:00 2001 From: David Garske Date: Fri, 7 Jun 2019 13:58:17 -0700 Subject: [PATCH 15/21] Further `user_settings.h` cleanup / additions. Additional README.md comments. --- IDE/ECLIPSE/SIFIVE/Makefile | 4 +- IDE/ECLIPSE/SIFIVE/main.c | 6 +-- IDE/ECLIPSE/SIFIVE/user_settings.h | 86 ++++++++++++++++++------------ 3 files changed, 59 insertions(+), 37 deletions(-) diff --git a/IDE/ECLIPSE/SIFIVE/Makefile b/IDE/ECLIPSE/SIFIVE/Makefile index b19c86f6a..594686aa2 100644 --- a/IDE/ECLIPSE/SIFIVE/Makefile +++ b/IDE/ECLIPSE/SIFIVE/Makefile @@ -22,13 +22,15 @@ OPT_CFLAGS = -specs=nano.specs #OPT_CFLAGS += -O3 -DTIME -DNOENUM -Wno-implicit -mexplicit-relocs -save-temps #OPT_CFLAGS += -fno-inline -fno-builtin-printf -fno-common -falign-functions=4 -# ovewrite the __stack_size default value of 0x400 with 0x1000(4 Kbytes). +# override the __stack_size and __heap_size default values of 0x400 +# SiFive HiFive1 has 16KB of data SRAM # The __stack_size and __heap_size symbols are defined in the linker metal.default.ld # script in the freedom-e-sdk. 
override CFLAGS += $(OPT_CFLAGS) $(WOLFSSL_CFLAGS) \ -Xlinker --defsym=__stack_size=0x1200 \ -Xlinker --defsym=__heap_size=0x800 + $(PROGRAM): $(SRC) $(CC) $(CFLAGS) $(SRC) $(LDFLAGS) $(LDLIBS) -o $@ diff --git a/IDE/ECLIPSE/SIFIVE/main.c b/IDE/ECLIPSE/SIFIVE/main.c index 99369e33e..50c398ae5 100644 --- a/IDE/ECLIPSE/SIFIVE/main.c +++ b/IDE/ECLIPSE/SIFIVE/main.c @@ -130,7 +130,7 @@ int my_rng_gen_block(unsigned char* output, unsigned int sz) int main(void) { int ret; - long clkHz = 16000000; /* default */ + long clk_Hz = 16000000; /* default */ #if WOLFSSL_SIFIVE_RISC_V_DEBUG printf("check stack and heap addresses\n"); @@ -144,11 +144,11 @@ int main(void) /* Speed up clock */ printf("SiFive HiFive1 Demo\n"); printf("Setting clock to %dMHz\n", USE_CLOCK_HZ/1000000); - clkHz = metal_clock_set_rate_hz( + clk_Hz = metal_clock_set_rate_hz( &__METAL_DT_SIFIVE_FE310_G000_PLL_HANDLE->clock, USE_CLOCK_HZ ); #endif - printf("Actual Clock %dMHz\n", clkHz/1000000); + printf("Actual Clock %dMHz\n", clk_Hz/1000000); #ifdef DEBUG_WOLFSSL wolfSSL_Debugging_ON(); diff --git a/IDE/ECLIPSE/SIFIVE/user_settings.h b/IDE/ECLIPSE/SIFIVE/user_settings.h index 249783c40..9e42bb476 100644 --- a/IDE/ECLIPSE/SIFIVE/user_settings.h +++ b/IDE/ECLIPSE/SIFIVE/user_settings.h @@ -70,22 +70,9 @@ extern "C" { //#define TFM_ARM #endif -/* Wolf Single Precision Math */ -/* Optional ECC SECP256R1 acceleration using optimized C code */ -#undef WOLFSSL_SP -#if 1 - #define WOLFSSL_SP - #define WOLFSSL_SP_SMALL /* use smaller version of code (requires heap) */ - #define SP_WORD_SIZE 32 /* force 32-bit type */ - #define WOLFSSL_SP_MATH /* only SP math - eliminates fast math code */ - //#define WOLFSSL_SP_DIV_32 /* do not use 64-bit divides */ - - #define WOLFSSL_HAVE_SP_ECC - //#define WOLFSSL_HAVE_SP_RSA -#endif /* ------------------------------------------------------------------------- */ -/* Crypto */ +/* Asymmetric */ /* ------------------------------------------------------------------------- */ /* 
RSA */ /* Not enabled due to memory constraints on HiFive1 */ @@ -205,6 +192,43 @@ extern "C" { #endif +/* Wolf Single Precision Math */ +/* Optional ECC SECP256R1 acceleration using optimized C code */ +#undef WOLFSSL_SP +#if 1 + #define WOLFSSL_SP + #define WOLFSSL_SP_SMALL /* use smaller version of code (requires heap) */ + #define SP_WORD_SIZE 32 /* force 32-bit type */ + #define WOLFSSL_SP_MATH /* only SP math - eliminates fast math code */ + //#define WOLFSSL_SP_DIV_32 /* do not use 64-bit divides */ + + #ifdef HAVE_ECC + #define WOLFSSL_HAVE_SP_ECC + #endif + #ifndef NO_RSA + #define WOLFSSL_HAVE_SP_RSA + #endif +#endif + +/* Ed25519 / Curve25519 */ +#undef HAVE_CURVE25519 +#undef HAVE_ED25519 +#if 1 + #define HAVE_CURVE25519 + #define HAVE_ED25519 /* ED25519 Requires SHA512 */ + + /* Optionally use small math (less flash usage, but much slower) */ + #if 1 + /* Curve and Ed 25519 small */ + #define CURVED25519_SMALL + #endif +#endif + + +/* ------------------------------------------------------------------------- */ +/* Symmetric Ciphers */ +/* ------------------------------------------------------------------------- */ + /* AES */ #undef NO_AES #if 1 @@ -230,7 +254,6 @@ extern "C" { //#define HAVE_AESCCM #endif - /* DES3 */ #undef NO_DES3 #if 0 @@ -250,23 +273,9 @@ extern "C" { #define HAVE_ONE_TIME_AUTH #endif -/* Ed25519 / Curve25519 */ -#undef HAVE_CURVE25519 -#undef HAVE_ED25519 -#if 1 - #define HAVE_CURVE25519 - #define HAVE_ED25519 /* ED25519 Requires SHA512 */ - - /* Optionally use small math (less flash usage, but much slower) */ - #if 1 - /* Curve and Ed 25519 small */ - #define CURVED25519_SMALL - #endif -#endif - /* ------------------------------------------------------------------------- */ -/* Hashing */ +/* Symmetric Hashing */ /* ------------------------------------------------------------------------- */ /* Sha */ #undef NO_SHA @@ -320,6 +329,18 @@ extern "C" { #define NO_MD5 #endif +/* Blake2B */ +#undef HAVE_BLAKE2 +#if 0 + #define 
HAVE_BLAKE2 +#endif + +/* Blake2S */ +#undef HAVE_BLAKE2S +#if 0 + #define HAVE_BLAKE2S +#endif + /* HKDF */ #undef HAVE_HKDF #if 0 @@ -386,6 +407,7 @@ extern "C" { #define XREALLOC(p, n, h, t) myRealloc(p, n, h, t) #endif +/* Static memory */ #if 0 /* Static memory requires fast math */ #define WOLFSSL_STATIC_MEMORY @@ -423,9 +445,8 @@ extern "C" { /* ------------------------------------------------------------------------- */ /* Override Current Time */ -/* Allows custom "custom_time()" function to be used for benchmark */ #if defined(WOLFSSL_SIFIVE_RISC_V) - #define WOLFSSL_USER_CURRTIME + #define WOLFSSL_USER_CURRTIME /* for benchmarks, uses "custom_time()" function */ #define WOLFSSL_GMTIME #define USER_TICKS #else @@ -569,4 +590,3 @@ extern "C" { #endif #endif /* WOLFSSL_USER_SETTINGS_H */ - From c5aa13021aa8c27312b04a5bc57fa13c0b3460fa Mon Sep 17 00:00:00 2001 From: kaleb-himes Date: Fri, 7 Jun 2019 16:09:35 -0600 Subject: [PATCH 16/21] Adjustment to test.c for merge of WCv4.0.1 changes --- wolfcrypt/test/test.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 9e29cf59e..7ba338c45 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -5457,10 +5457,17 @@ static int aes_key_size_test(void) word32 keySize; #endif +#if !defined(HAVE_FIPS) || \ + defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2) + /* w/ FIPS v1 (cert 2425) wc_AesInit just returns 0 always as it's not + * supported with that FIPS version */ ret = wc_AesInit(NULL, HEAP_HINT, devId); if (ret != BAD_FUNC_ARG) return -4800; +#endif + ret = wc_AesInit(&aes, HEAP_HINT, devId); + /* 0 check OK for FIPSv1 */ if (ret != 0) return -4801; From cdf0241ed0e7b287c2e4965874c582ce01577c82 Mon Sep 17 00:00:00 2001 From: David Garske Date: Sun, 9 Jun 2019 19:43:55 +0200 Subject: [PATCH 17/21] Updated benchmark with SHA512 "slow" version, which performs faster with the RISC-V compiler optimizations. 
--- IDE/ECLIPSE/SIFIVE/README.md | 2 +- IDE/ECLIPSE/SIFIVE/user_settings.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/IDE/ECLIPSE/SIFIVE/README.md b/IDE/ECLIPSE/SIFIVE/README.md index 385ddceee..5ce2271bc 100644 --- a/IDE/ECLIPSE/SIFIVE/README.md +++ b/IDE/ECLIPSE/SIFIVE/README.md @@ -158,7 +158,7 @@ CHA-POLY 675 KB took 1.021 seconds, 661.060 KB/s POLY1305 2 MB took 1.007 seconds, 2.230 MB/s SHA 1 MB took 1.016 seconds, 1.321 MB/s SHA-256 425 KB took 1.005 seconds, 422.909 KB/s -SHA-512 25 KB took 2.043 seconds, 12.239 KB/s +SHA-512 225 KB took 1.009 seconds, 223.073 KB/s HMAC-SHA 1 MB took 1.010 seconds, 1.378 MB/s HMAC-SHA256 425 KB took 1.037 seconds, 409.781 KB/s HMAC-SHA512 25 KB took 2.075 seconds, 12.050 KB/s diff --git a/IDE/ECLIPSE/SIFIVE/user_settings.h b/IDE/ECLIPSE/SIFIVE/user_settings.h index 9e42bb476..2f7f136cb 100644 --- a/IDE/ECLIPSE/SIFIVE/user_settings.h +++ b/IDE/ECLIPSE/SIFIVE/user_settings.h @@ -312,7 +312,7 @@ extern "C" { #endif /* over twice as small, but 50% slower */ - //#define USE_SLOW_SHA512 + #define USE_SLOW_SHA512 #endif /* Sha3 */ From a48981c3c6b6b72ecabceef82b54c93b70f37d91 Mon Sep 17 00:00:00 2001 From: Juliusz Sosinowicz Date: Fri, 10 May 2019 14:26:53 +0200 Subject: [PATCH 18/21] Chacha20 ARM optimization --- .gitignore | 5 + IDE/LPCXPRESSO/README.md | 19 +- .cproject => LPCExpresso.cproject | 0 .project => LPCExpresso.project | 0 src/include.am | 4 + wolfcrypt/src/chacha.c | 5 + wolfcrypt/src/include.am | 3 + wolfcrypt/src/port/arm/armv8-chacha.c | 2858 +++++++++++++++++++++++++ wolfcrypt/test/test.c | 212 +- 9 files changed, 3093 insertions(+), 13 deletions(-) rename .cproject => LPCExpresso.cproject (100%) rename .project => LPCExpresso.project (100%) create mode 100644 wolfcrypt/src/port/arm/armv8-chacha.c diff --git a/.gitignore b/.gitignore index 92d94d892..6149b85d7 100644 --- a/.gitignore +++ b/.gitignore @@ -321,3 +321,8 @@ doc/pdf # XCODE Index IDE/XCODE/Index + +# ARM DS-5 +\.settings/ 
+\.cproject +\.project diff --git a/IDE/LPCXPRESSO/README.md b/IDE/LPCXPRESSO/README.md index 9a93c021a..e934caa20 100644 --- a/IDE/LPCXPRESSO/README.md +++ b/IDE/LPCXPRESSO/README.md @@ -2,15 +2,16 @@ To use, install the NXP LPCXpresso IDE and import the projects in a new workspace. -1. Run LPCXpresso and choose a workspace location. -2. Right click in the project exporer window and choose Inport. -3. Under General choose "Existing Projects into Workspace". -4. Under "Select root directory" click browse and select the wolfSSL root. -5. Check the "Search for nested projects" box. -5. Make sure "wolfssl" and "wolfssl_example" are checked under "Projects:". -6. Click finish. -7. Download the board and chip LPCOpen package for your platform. -8. Import the projects. For example "lpc_board_nxp_lpcxpresso_1837" and "lpc_chip_18xx" are the ones for the LPC18S37. +1. Change names of `LPCExpresso.project` and `LPCExpresso.cproject` files to `.project` and `.cproject` +2. Run LPCXpresso and choose a workspace location. +3. Right click in the project explorer window and choose Import. +4. Under General choose "Existing Projects into Workspace". +5. Under "Select root directory" click browse and select the wolfSSL root. +6. Check the "Search for nested projects" box. +7. Make sure "wolfssl" and "wolfssl_example" are checked under "Projects:". +8. Click finish. +9. Download the board and chip LPCOpen package for your platform. +10. Import the projects. For example "lpc_board_nxp_lpcxpresso_1837" and "lpc_chip_18xx" are the ones for the LPC18S37. To setup this example to work with different baords/chips you will need to locate the LPCOpen sources for LPCXpresso on the NXP website and import the board and chip projects. Then you will need to update the "wolfssl_example" project properties to reference these projects (C/C++ General -> Paths and Symbols -> References). 
See the [LPCOpen v2.xx LPCXpresso quickstart guide for all platforms](https://www.lpcware.com/content/project/lpcopen-platform-nxp-lpc-microcontrollers/lpcopen-v200-quickstart-guides/lpcopen-1) for additional information. diff --git a/.cproject b/LPCExpresso.cproject similarity index 100% rename from .cproject rename to LPCExpresso.cproject diff --git a/.project b/LPCExpresso.project similarity index 100% rename from .project rename to LPCExpresso.project diff --git a/src/include.am b/src/include.am index 4861f00ce..2eb5697e7 100644 --- a/src/include.am +++ b/src/include.am @@ -340,10 +340,14 @@ src_libwolfssl_la_SOURCES += wolfcrypt/src/rabbit.c endif if BUILD_CHACHA +if BUILD_ARMASM +src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c +else src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha.c if BUILD_INTELASM src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha_asm.S endif +endif if BUILD_POLY1305 src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha20_poly1305.c endif diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index f4d041800..71b81086b 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -27,6 +27,10 @@ */ +#ifdef WOLFSSL_ARMASM + /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */ + +#else #ifdef HAVE_CONFIG_H #include #endif @@ -316,3 +320,4 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, #endif /* HAVE_CHACHA*/ +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am index 3e7b3a377..ba1f7b6a7 100644 --- a/wolfcrypt/src/include.am +++ b/wolfcrypt/src/include.am @@ -48,6 +48,9 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \ wolfcrypt/src/port/ti/ti-ccm.c \ wolfcrypt/src/port/pic32/pic32mz-crypt.c \ wolfcrypt/src/port/nrf51.c \ + wolfcrypt/src/port/arm/armv8-aes.c \ + wolfcrypt/src/port/arm/armv8-sha256.c \ + wolfcrypt/src/port/arm/armv8-chacha.c \ wolfcrypt/src/port/arm/armv8-curve25519.c \ wolfcrypt/src/port/arm/armv7-curve25519.c \ 
wolfcrypt/src/port/arm/armv8-sha512-asm.c \ diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c new file mode 100644 index 000000000..8eebc0334 --- /dev/null +++ b/wolfcrypt/src/port/arm/armv8-chacha.c @@ -0,0 +1,2858 @@ +/* armv8-chacha.c + * + * Copyright (C) 2006-2019 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + * + */ + +/* The paper NEON crypto by Daniel J. Bernstein and Peter Schwabe was used to optimize for ARM + * https://cryptojedi.org/papers/neoncrypto-20120320.pdf + */ + +#ifdef WOLFSSL_ARMASM + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include + +#ifdef HAVE_CHACHA + +#include +#include +#include +#include +#ifdef NO_INLINE + #include +#else + #define WOLFSSL_MISC_INCLUDED + #include +#endif + +#ifdef CHACHA_AEAD_TEST + #include +#endif + +#ifdef CHACHA_TEST + #include +#endif + +#ifdef BIG_ENDIAN_ORDER + #define LITTLE32(x) ByteReverseWord32(x) +#else + #define LITTLE32(x) (x) +#endif + +/* Number of rounds */ +#define ROUNDS 20 + +#define U32C(v) (v##U) +#define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF)) +#define U8TO32_LITTLE(p) LITTLE32(((word32*)(p))[0]) + +#define PLUS(v,w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v),1)) + +#define ARM_SIMD_LEN_BYTES 16 + +/** + * Set up iv(nonce). 
Earlier versions used 64 bits instead of 96, this version + * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. + */ +int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) +{ + word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ + +#ifdef CHACHA_AEAD_TEST + word32 i; + printf("NONCE : "); + for (i = 0; i < CHACHA_IV_BYTES; i++) { + printf("%02x", inIv[i]); + } + printf("\n\n"); +#endif + + if (ctx == NULL) + return BAD_FUNC_ARG; + + XMEMCPY(temp, inIv, CHACHA_IV_BYTES); + + ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */ + ctx->X[CHACHA_IV_BYTES+1] = LITTLE32(temp[0]); /* fixed variable from nonce */ + ctx->X[CHACHA_IV_BYTES+2] = LITTLE32(temp[1]); /* counter from nonce */ + ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */ + + return 0; +} + +/* "expand 32-byte k" as unsigned 32 byte */ +static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; +/* "expand 16-byte k" as unsigned 16 byte */ +static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574}; + +/** + * Key setup. 
8 word iv (nonce) + */ +int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) +{ + const word32* constants; + const byte* k; + +#ifdef XSTREAM_ALIGN + word32 alignKey[8]; +#endif + + if (ctx == NULL) + return BAD_FUNC_ARG; + + if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ) + return BAD_FUNC_ARG; + +#ifdef XSTREAM_ALIGN + if ((wolfssl_word)key % 4) { + WOLFSSL_MSG("wc_ChachaSetKey unaligned key"); + XMEMCPY(alignKey, key, keySz); + k = (byte*)alignKey; + } + else { + k = key; + } +#else + k = key; +#endif /* XSTREAM_ALIGN */ + +#ifdef CHACHA_AEAD_TEST + word32 i; + printf("ChaCha key used :\n"); + for (i = 0; i < keySz; i++) { + printf("%02x", key[i]); + if ((i + 1) % 8 == 0) + printf("\n"); + } + printf("\n\n"); +#endif + + ctx->X[4] = U8TO32_LITTLE(k + 0); + ctx->X[5] = U8TO32_LITTLE(k + 4); + ctx->X[6] = U8TO32_LITTLE(k + 8); + ctx->X[7] = U8TO32_LITTLE(k + 12); + if (keySz == CHACHA_MAX_KEY_SZ) { + k += 16; + constants = sigma; + } + else { + constants = tau; + } + ctx->X[ 8] = U8TO32_LITTLE(k + 0); + ctx->X[ 9] = U8TO32_LITTLE(k + 4); + ctx->X[10] = U8TO32_LITTLE(k + 8); + ctx->X[11] = U8TO32_LITTLE(k + 12); + ctx->X[ 0] = constants[0]; + ctx->X[ 1] = constants[1]; + ctx->X[ 2] = constants[2]; + ctx->X[ 3] = constants[3]; + + return 0; +} + +static const word32 L_chacha20_neon_inc_first_word[] = { + 0x1, + 0x0, + 0x0, + 0x0, +}; + +#ifdef __aarch64__ + +static const word32 L_chacha20_neon_add_all_counters[] = { + 0x0, + 0x1, + 0x2, + 0x3, +}; + +static const word32 L_chacha20_neon_rol8[] = { + 0x2010003, + 0x6050407, + 0xa09080b, + 0xe0d0c0f, +}; + +static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m, byte* c, word32 bytes) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_320 with %d bytes\n", bytes); +#endif /*CHACHA_TEST */ + + __asm__ __volatile__ ( + /* + * The layout of used registers is: + * ARM + * w4-w19: these registers hold the fifth Chacha block for calculation in regular ARM + * 
w20: loop counter for how many even-odd rounds need to be executed + * w21: the counter offset for the block in ARM registers + * NEON + * v0-v15: the vi'th register holds the i'th word of four blocks during the quarter rounds. + * these registers are later transposed make ADDing the input and XORing the message easier. + * v16-v19: these are helper registers that are used as temporary location to store data + * v20-v23: load the next message block + * v24-v27: the 64 byte intial Chacha block + * v28: vector to increment the counter words of each block + * v29: vector of 5's to increment counters between L_chacha20_arm64_outer_%= loops + * v30: table lookup indices to rotate values by 8 + */ + + /* Load counter-add values for each block */ + "LD1 {v28.4s}, [%[L_chacha20_neon_add_all_counters]] \n\t" + /* Load index look-up for rotating left 8 bits */ + "LD1 {v30.16b}, [%[L_chacha20_neon_rol8]] \n\t" + /* For adding 5 to each counter-add for next 320-byte chunk */ + "MOVI v29.4s, #5 \n\t" + /* Counter for 5th block in regular registers */ + "MOV w21, #4 \n\t" + /* Load state to encrypt */ + "LD1 {v24.4s-v27.4s}, [%[input]] \n\t" + "\n" + "L_chacha20_arm64_outer_%=: \n\t" + /* Move state into regular registers */ + "MOV x4, v24.d[0] \n\t" + "MOV x6, v24.d[1] \n\t" + "MOV x8, v25.d[0] \n\t" + "MOV x10, v25.d[1] \n\t" + "MOV x12, v26.d[0] \n\t" + "MOV x14, v26.d[1] \n\t" + "MOV x16, v27.d[0] \n\t" + "MOV x18, v27.d[1] \n\t" + /* Move state into vector registers (x4) */ + "DUP v0.4s, v24.s[0] \n\t" + "DUP v1.4s, v24.s[1] \n\t" + "LSR x5, x4, #32 \n\t" + "DUP v2.4s, v24.s[2] \n\t" + "DUP v3.4s, v24.s[3] \n\t" + "LSR x7, x6, #32 \n\t" + "DUP v4.4s, v25.s[0] \n\t" + "DUP v5.4s, v25.s[1] \n\t" + "LSR x9, x8, #32 \n\t" + "DUP v6.4s, v25.s[2] \n\t" + "DUP v7.4s, v25.s[3] \n\t" + "LSR x11, x10, #32 \n\t" + "DUP v8.4s, v26.s[0] \n\t" + "DUP v9.4s, v26.s[1] \n\t" + "LSR x13, x12, #32 \n\t" + "DUP v10.4s, v26.s[2] \n\t" + "DUP v11.4s, v26.s[3] \n\t" + "LSR x15, x14, #32 \n\t" + 
"DUP v12.4s, v27.s[0] \n\t" + "DUP v13.4s, v27.s[1] \n\t" + "LSR x17, x16, #32 \n\t" + "DUP v14.4s, v27.s[2] \n\t" + "DUP v15.4s, v27.s[3] \n\t" + "LSR x19, x18, #32 \n\t" + /* Add to counter word */ + "ADD v12.4s, v12.4s, v28.4s \n\t" + "ADD w16, w16, w21 \n\t" + /* Set number of odd+even rounds to perform */ + "MOV w20, #10 \n\t" + "\n" + "L_chacha20_arm64_inner_%=: \n\t" + "SUBS w20, w20, #1 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4s, v0.4s, v4.4s \n\t" + "ADD w4, w4, w8 \n\t" + "ADD v1.4s, v1.4s, v5.4s \n\t" + "ADD w5, w5, w9 \n\t" + "ADD v2.4s, v2.4s, v6.4s \n\t" + "ADD w6, w6, w10 \n\t" + "ADD v3.4s, v3.4s, v7.4s \n\t" + "ADD w7, w7, w11 \n\t" + "EOR v12.16b, v12.16b, v0.16b \n\t" + "EOR w16, w16, w4 \n\t" + "EOR v13.16b, v13.16b, v1.16b \n\t" + "EOR w17, w17, w5 \n\t" + "EOR v14.16b, v14.16b, v2.16b \n\t" + "EOR w18, w18, w6 \n\t" + "EOR v15.16b, v15.16b, v3.16b \n\t" + "EOR w19, w19, w7 \n\t" + "REV32 v12.8h, v12.8h \n\t" + "ROR w16, w16, #16 \n\t" + "REV32 v13.8h, v13.8h \n\t" + "ROR w17, w17, #16 \n\t" + "REV32 v14.8h, v14.8h \n\t" + "ROR w18, w18, #16 \n\t" + "REV32 v15.8h, v15.8h \n\t" + "ROR w19, w19, #16 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v8.4s, v8.4s, v12.4s \n\t" + "ADD w12, w12, w16 \n\t" + "ADD v9.4s, v9.4s, v13.4s \n\t" + "ADD w13, w13, w17 \n\t" + "ADD v10.4s, v10.4s, v14.4s \n\t" + "ADD w14, w14, w18 \n\t" + "ADD v11.4s, v11.4s, v15.4s \n\t" + "ADD w15, w15, w19 \n\t" + "EOR v16.16b, v4.16b, v8.16b \n\t" + "EOR w8, w8, w12 \n\t" + "EOR v17.16b, v5.16b, v9.16b \n\t" + "EOR w9, w9, w13 \n\t" + "EOR v18.16b, v6.16b, v10.16b \n\t" + "EOR w10, w10, w14 \n\t" + "EOR v19.16b, v7.16b, v11.16b \n\t" + "EOR w11, w11, w15 \n\t" + "SHL v4.4s, v16.4s, #12 \n\t" + "ROR w8, w8, #20 \n\t" + "SHL v5.4s, v17.4s, #12 \n\t" + "ROR w9, w9, #20 \n\t" + "SHL v6.4s, v18.4s, #12 \n\t" + "ROR w10, w10, #20 \n\t" + "SHL v7.4s, v19.4s, #12 \n\t" + "ROR w11, w11, #20 \n\t" + "SRI v4.4s, v16.4s, #20 \n\t" + "SRI v5.4s, v17.4s, #20 
\n\t" + "SRI v6.4s, v18.4s, #20 \n\t" + "SRI v7.4s, v19.4s, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4s, v0.4s, v4.4s \n\t" + "ADD w4, w4, w8 \n\t" + "ADD v1.4s, v1.4s, v5.4s \n\t" + "ADD w5, w5, w9 \n\t" + "ADD v2.4s, v2.4s, v6.4s \n\t" + "ADD w6, w6, w10 \n\t" + "ADD v3.4s, v3.4s, v7.4s \n\t" + "ADD w7, w7, w11 \n\t" + "EOR v12.16b, v12.16b, v0.16b \n\t" + "EOR w16, w16, w4 \n\t" + "EOR v13.16b, v13.16b, v1.16b \n\t" + "EOR w17, w17, w5 \n\t" + "EOR v14.16b, v14.16b, v2.16b \n\t" + "EOR w18, w18, w6 \n\t" + "EOR v15.16b, v15.16b, v3.16b \n\t" + "EOR w19, w19, w7 \n\t" + "TBL v12.16b, { v12.16b }, v30.16b \n\t" + "ROR w16, w16, #24 \n\t" + "TBL v13.16b, { v13.16b }, v30.16b \n\t" + "ROR w17, w17, #24 \n\t" + "TBL v14.16b, { v14.16b }, v30.16b \n\t" + "ROR w18, w18, #24 \n\t" + "TBL v15.16b, { v15.16b }, v30.16b \n\t" + "ROR w19, w19, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v8.4s, v8.4s, v12.4s \n\t" + "ADD w12, w12, w16 \n\t" + "ADD v9.4s, v9.4s, v13.4s \n\t" + "ADD w13, w13, w17 \n\t" + "ADD v10.4s, v10.4s, v14.4s \n\t" + "ADD w14, w14, w18 \n\t" + "ADD v11.4s, v11.4s, v15.4s \n\t" + "ADD w15, w15, w19 \n\t" + "EOR v16.16b, v4.16b, v8.16b \n\t" + "EOR w8, w8, w12 \n\t" + "EOR v17.16b, v5.16b, v9.16b \n\t" + "EOR w9, w9, w13 \n\t" + "EOR v18.16b, v6.16b, v10.16b \n\t" + "EOR w10, w10, w14 \n\t" + "EOR v19.16b, v7.16b, v11.16b \n\t" + "EOR w11, w11, w15 \n\t" + "SHL v4.4s, v16.4s, #7 \n\t" + "ROR w8, w8, #25 \n\t" + "SHL v5.4s, v17.4s, #7 \n\t" + "ROR w9, w9, #25 \n\t" + "SHL v6.4s, v18.4s, #7 \n\t" + "ROR w10, w10, #25 \n\t" + "SHL v7.4s, v19.4s, #7 \n\t" + "ROR w11, w11, #25 \n\t" + "SRI v4.4s, v16.4s, #25 \n\t" + "SRI v5.4s, v17.4s, #25 \n\t" + "SRI v6.4s, v18.4s, #25 \n\t" + "SRI v7.4s, v19.4s, #25 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4s, v0.4s, v5.4s \n\t" + "ADD w4, w4, w9 \n\t" + "ADD v1.4s, v1.4s, v6.4s \n\t" + "ADD w5, w5, w10 \n\t" + "ADD v2.4s, v2.4s, v7.4s \n\t" + "ADD w6, w6, w11 \n\t" + "ADD 
v3.4s, v3.4s, v4.4s \n\t" + "ADD w7, w7, w8 \n\t" + "EOR v15.16b, v15.16b, v0.16b \n\t" + "EOR w19, w19, w4 \n\t" + "EOR v12.16b, v12.16b, v1.16b \n\t" + "EOR w16, w16, w5 \n\t" + "EOR v13.16b, v13.16b, v2.16b \n\t" + "EOR w17, w17, w6 \n\t" + "EOR v14.16b, v14.16b, v3.16b \n\t" + "EOR w18, w18, w7 \n\t" + "REV32 v15.8h, v15.8h \n\t" + "ROR w19, w19, #16 \n\t" + "REV32 v12.8h, v12.8h \n\t" + "ROR w16, w16, #16 \n\t" + "REV32 v13.8h, v13.8h \n\t" + "ROR w17, w17, #16 \n\t" + "REV32 v14.8h, v14.8h \n\t" + "ROR w18, w18, #16 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v10.4s, v10.4s, v15.4s \n\t" + "ADD w14, w14, w19 \n\t" + "ADD v11.4s, v11.4s, v12.4s \n\t" + "ADD w15, w15, w16 \n\t" + "ADD v8.4s, v8.4s, v13.4s \n\t" + "ADD w12, w12, w17 \n\t" + "ADD v9.4s, v9.4s, v14.4s \n\t" + "ADD w13, w13, w18 \n\t" + "EOR v16.16b, v5.16b, v10.16b \n\t" + "EOR w9, w9, w14 \n\t" + "EOR v17.16b, v6.16b, v11.16b \n\t" + "EOR w10, w10, w15 \n\t" + "EOR v18.16b, v7.16b, v8.16b \n\t" + "EOR w11, w11, w12 \n\t" + "EOR v19.16b, v4.16b, v9.16b \n\t" + "EOR w8, w8, w13 \n\t" + "SHL v5.4s, v16.4s, #12 \n\t" + "ROR w9, w9, #20 \n\t" + "SHL v6.4s, v17.4s, #12 \n\t" + "ROR w10, w10, #20 \n\t" + "SHL v7.4s, v18.4s, #12 \n\t" + "ROR w11, w11, #20 \n\t" + "SHL v4.4s, v19.4s, #12 \n\t" + "ROR w8, w8, #20 \n\t" + "SRI v5.4s, v16.4s, #20 \n\t" + "SRI v6.4s, v17.4s, #20 \n\t" + "SRI v7.4s, v18.4s, #20 \n\t" + "SRI v4.4s, v19.4s, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4s, v0.4s, v5.4s \n\t" + "ADD w4, w4, w9 \n\t" + "ADD v1.4s, v1.4s, v6.4s \n\t" + "ADD w5, w5, w10 \n\t" + "ADD v2.4s, v2.4s, v7.4s \n\t" + "ADD w6, w6, w11 \n\t" + "ADD v3.4s, v3.4s, v4.4s \n\t" + "ADD w7, w7, w8 \n\t" + "EOR v15.16b, v15.16b, v0.16b \n\t" + "EOR w19, w19, w4 \n\t" + "EOR v12.16b, v12.16b, v1.16b \n\t" + "EOR w16, w16, w5 \n\t" + "EOR v13.16b, v13.16b, v2.16b \n\t" + "EOR w17, w17, w6 \n\t" + "EOR v14.16b, v14.16b, v3.16b \n\t" + "EOR w18, w18, w7 \n\t" + "TBL v15.16b, { v15.16b }, v30.16b \n\t" + 
"ROR w19, w19, #24 \n\t" + "TBL v12.16b, { v12.16b }, v30.16b \n\t" + "ROR w16, w16, #24 \n\t" + "TBL v13.16b, { v13.16b }, v30.16b \n\t" + "ROR w17, w17, #24 \n\t" + "TBL v14.16b, { v14.16b }, v30.16b \n\t" + "ROR w18, w18, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v10.4s, v10.4s, v15.4s \n\t" + "ADD w14, w14, w19 \n\t" + "ADD v11.4s, v11.4s, v12.4s \n\t" + "ADD w15, w15, w16 \n\t" + "ADD v8.4s, v8.4s, v13.4s \n\t" + "ADD w12, w12, w17 \n\t" + "ADD v9.4s, v9.4s, v14.4s \n\t" + "ADD w13, w13, w18 \n\t" + "EOR v16.16b, v5.16b, v10.16b \n\t" + "EOR w9, w9, w14 \n\t" + "EOR v17.16b, v6.16b, v11.16b \n\t" + "EOR w10, w10, w15 \n\t" + "EOR v18.16b, v7.16b, v8.16b \n\t" + "EOR w11, w11, w12 \n\t" + "EOR v19.16b, v4.16b, v9.16b \n\t" + "EOR w8, w8, w13 \n\t" + "SHL v5.4s, v16.4s, #7 \n\t" + "ROR w9, w9, #25 \n\t" + "SHL v6.4s, v17.4s, #7 \n\t" + "ROR w10, w10, #25 \n\t" + "SHL v7.4s, v18.4s, #7 \n\t" + "ROR w11, w11, #25 \n\t" + "SHL v4.4s, v19.4s, #7 \n\t" + "ROR w8, w8, #25 \n\t" + "SRI v5.4s, v16.4s, #25 \n\t" + "SRI v6.4s, v17.4s, #25 \n\t" + "SRI v7.4s, v18.4s, #25 \n\t" + "SRI v4.4s, v19.4s, #25 \n\t" + "BNE L_chacha20_arm64_inner_%= \n\t" + /* Add counter now rather than after transposed */ + "ADD v12.4s, v12.4s, v28.4s \n\t" + "ADD w16, w16, w21 \n\t" + /* Load message */ + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + /* Transpose vectors */ + "TRN1 v16.4s, v0.4s, v1.4s \n\t" + "TRN1 v18.4s, v2.4s, v3.4s \n\t" + "TRN2 v17.4s, v0.4s, v1.4s \n\t" + "TRN2 v19.4s, v2.4s, v3.4s \n\t" + "TRN1 v0.2d, v16.2d, v18.2d \n\t" + "TRN1 v1.2d, v17.2d, v19.2d \n\t" + "TRN2 v2.2d, v16.2d, v18.2d \n\t" + "TRN2 v3.2d, v17.2d, v19.2d \n\t" + "TRN1 v16.4s, v4.4s, v5.4s \n\t" + "TRN1 v18.4s, v6.4s, v7.4s \n\t" + "TRN2 v17.4s, v4.4s, v5.4s \n\t" + "TRN2 v19.4s, v6.4s, v7.4s \n\t" + "TRN1 v4.2d, v16.2d, v18.2d \n\t" + "TRN1 v5.2d, v17.2d, v19.2d \n\t" + "TRN2 v6.2d, v16.2d, v18.2d \n\t" + "TRN2 v7.2d, v17.2d, v19.2d \n\t" + "TRN1 v16.4s, v8.4s, v9.4s \n\t" + "TRN1 v18.4s, v10.4s, 
v11.4s \n\t" + "TRN2 v17.4s, v8.4s, v9.4s \n\t" + "TRN2 v19.4s, v10.4s, v11.4s \n\t" + "TRN1 v8.2d, v16.2d, v18.2d \n\t" + "TRN1 v9.2d, v17.2d, v19.2d \n\t" + "TRN2 v10.2d, v16.2d, v18.2d \n\t" + "TRN2 v11.2d, v17.2d, v19.2d \n\t" + "TRN1 v16.4s, v12.4s, v13.4s \n\t" + "TRN1 v18.4s, v14.4s, v15.4s \n\t" + "TRN2 v17.4s, v12.4s, v13.4s \n\t" + "TRN2 v19.4s, v14.4s, v15.4s \n\t" + "TRN1 v12.2d, v16.2d, v18.2d \n\t" + "TRN1 v13.2d, v17.2d, v19.2d \n\t" + "TRN2 v14.2d, v16.2d, v18.2d \n\t" + "TRN2 v15.2d, v17.2d, v19.2d \n\t" + /* Add back state, XOR in message and store (load next block) */ + "ADD v16.4s, v0.4s, v24.4s \n\t" + "ADD v17.4s, v4.4s, v25.4s \n\t" + "ADD v18.4s, v8.4s, v26.4s \n\t" + "ADD v19.4s, v12.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "ADD v16.4s, v1.4s, v24.4s \n\t" + "ADD v17.4s, v5.4s, v25.4s \n\t" + "ADD v18.4s, v9.4s, v26.4s \n\t" + "ADD v19.4s, v13.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "ADD v16.4s, v2.4s, v24.4s \n\t" + "ADD v17.4s, v6.4s, v25.4s \n\t" + "ADD v18.4s, v10.4s, v26.4s \n\t" + "ADD v19.4s, v14.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "ADD v16.4s, v3.4s, v24.4s \n\t" + "ADD v17.4s, v7.4s, v25.4s \n\t" + "ADD v18.4s, v11.4s, v26.4s \n\t" + "ADD v19.4s, v15.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + 
"EOR v19.16b, v19.16b, v23.16b \n\t" + "LD1 {v20.4s-v23.4s}, [%[m]], #64 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + /* Move regular registers into vector registers for adding and xor */ + "ORR x4, x4, x5, LSL #32 \n\t" + "ORR x6, x6, x7, LSL #32 \n\t" + "ORR x8, x8, x9, LSL #32 \n\t" + "MOV v16.d[0], x4 \n\t" + "ORR x10, x10, x11, LSL #32 \n\t" + "MOV v16.d[1], x6 \n\t" + "ORR x12, x12, x13, LSL #32 \n\t" + "MOV v17.d[0], x8 \n\t" + "ORR x14, x14, x15, LSL #32 \n\t" + "MOV v17.d[1], x10 \n\t" + "ORR x16, x16, x17, LSL #32 \n\t" + "MOV v18.d[0], x12 \n\t" + "ORR x18, x18, x19, LSL #32 \n\t" + "MOV v18.d[1], x14 \n\t" + "MOV v19.d[0], x16 \n\t" + "MOV v19.d[1], x18 \n\t" + /* Add back state, XOR in message and store */ + "ADD v16.4s, v16.4s, v24.4s \n\t" + "ADD v17.4s, v17.4s, v25.4s \n\t" + "ADD v18.4s, v18.4s, v26.4s \n\t" + "ADD v19.4s, v19.4s, v27.4s \n\t" + "EOR v16.16b, v16.16b, v20.16b \n\t" + "EOR v17.16b, v17.16b, v21.16b \n\t" + "EOR v18.16b, v18.16b, v22.16b \n\t" + "EOR v19.16b, v19.16b, v23.16b \n\t" + "ADD w21, w21, #5 \n\t" + "ST1 {v16.4s-v19.4s}, [%[c]], #64 \n\t" + "SUBS %[bytes], %[bytes], #320 \n\t" + "ADD v28.4s, v28.4s, v29.4s \n\t" + "BNE L_chacha20_arm64_outer_%= \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), + [bytes] "+r" (bytes) + : [L_chacha20_neon_add_all_counters] "r" (L_chacha20_neon_add_all_counters), + [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8) + : "memory", "cc", + "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", + "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27" + ); +} +#endif /* __aarch64__ */ + +/** + * Converts word into bytes with rotations having been done. 
+ */ +static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS], const byte* m, byte* c) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_256\n"); +#endif /*CHACHA_TEST */ + +#ifdef __aarch64__ + __asm__ __volatile__ ( + // v0-v3 - first block + // v12 first block helper + // v4-v7 - second block + // v13 second block helper + // v8-v11 - third block + // v14 third block helper + // w4-w19 - fourth block + + // v0 0 1 2 3 + // v1 4 5 6 7 + // v2 8 9 10 11 + // v3 12 13 14 15 + // load CHACHA state with indices placed as shown above + /* Load state to encrypt */ + "LD1 {v20.4S-v23.4S}, [%[input]] \n\t" + /* Load index look-up for rotating left 8 bits */ + "LD1 {v24.16B}, [%[L_chacha20_neon_rol8]] \n\t" + /* Move state into regular registers */ + "MOV x4, v20.D[0] \n\t" + "MOV x6, v20.D[1] \n\t" + "MOV x8, v21.D[0] \n\t" + "MOV x10, v21.D[1] \n\t" + "MOV x12, v22.D[0] \n\t" + "MOV x14, v22.D[1] \n\t" + "MOV x16, v23.D[0] \n\t" + "MOV x18, v23.D[1] \n\t" + /* Move state into vector registers (x3) */ + "MOV v0.16B, v20.16B \n\t" + "MOV v1.16B, v21.16B \n\t" + "LSR x19, x18, #32 \n\t" + "MOV v2.16B, v22.16B \n\t" + "ADD w20, w16, #1 \n\t" + "MOV v3.16B, v23.16B \n\t" + "LSR x17, x16, #32 \n\t" + "MOV v4.16B, v20.16B \n\t" + "MOV v5.16B, v21.16B \n\t" + "LSR x15, x14, #32 \n\t" + "MOV v6.16B, v22.16B \n\t" + "ADD w21, w16, #2 \n\t" + "MOV v7.16B, v23.16B \n\t" + "LSR x13, x12, #32 \n\t" + "MOV v8.16B, v20.16B \n\t" + "MOV v9.16B, v21.16B \n\t" + "LSR x11, x10, #32 \n\t" + "MOV v10.16B, v22.16B \n\t" + "ADD w16, w16, #3 \n\t" + "MOV v11.16B, v23.16B \n\t" + "LSR x9, x8, #32 \n\t" + /* Set counter word */ + "MOV v7.S[0], w20 \n\t" + "LSR x7, x6, #32 \n\t" + "MOV v11.S[0], w21 \n\t" + "LSR x5, x4, #32 \n\t" + /* Set number of odd+even rounds to perform */ + "MOV w3, #10 \n\t" + "\n" + "L_chacha20_arm64_256_loop_%=: \n\t" + "SUBS w3, w3, #1 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD w4, w4, w8 \n\t" + "ADD v0.4S, 
v0.4S, v1.4S \n\t" + "ADD w5, w5, w9 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ADD w6, w6, w10 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w7, w7, w11 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR w16, w16, w4 \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "EOR w17, w17, w5 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "EOR w18, w18, w6 \n\t" + "REV32 v3.8H, v3.8H \n\t" + "EOR w19, w19, w7 \n\t" + "REV32 v7.8H, v7.8H \n\t" + "ROR w16, w16, #16 \n\t" + "REV32 v11.8H, v11.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ROR w17, w17, #16 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ROR w18, w18, #16 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w19, w19, #16 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ADD w12, w12, w16 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ADD w13, w13, w17 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ADD w14, w14, w18 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w15, w15, w19 \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "EOR w8, w8, w12 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "EOR w9, w9, w13 \n\t" + "SHL v9.4S, v14.4S, #12 \n\t" + "EOR w10, w10, w14 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "EOR w11, w11, w15 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + "ROR w8, w8, #20 \n\t" + "SRI v9.4S, v14.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ROR w9, w9, #20 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ROR w10, w10, #20 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ROR w11, w11, #20 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w4, w4, w8 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "ADD w5, w5, w9 \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "ADD w6, w6, w10 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "ADD w7, w7, w11 \n\t" + "TBL v3.16B, { v3.16B }, v24.16B \n\t" + "EOR w16, w16, w4 \n\t" + "TBL v7.16B, { v7.16B }, v24.16B \n\t" + "EOR w17, w17, w5 \n\t" + "TBL v11.16B, { v11.16B }, v24.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "EOR w18, w18, w6 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR w19, w19, w7 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + 
"ROR w16, w16, #24 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ROR w17, w17, #24 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ROR w18, w18, #24 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ROR w19, w19, #24 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w12, w12, w16 \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "ADD w13, w13, w17 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "ADD w14, w14, w18 \n\t" + "SHL v9.4S, v14.4S, #7 \n\t" + "ADD w15, w15, w19 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EOR w8, w8, w12 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EOR w9, w9, w13 \n\t" + "SRI v9.4S, v14.4S, #25 \n\t" + "EOR w10, w10, w14 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EOR w11, w11, w15 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "ROR w8, w8, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "ROR w9, w9, #25 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #4 \n\t" + "ROR w10, w10, #25 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "ROR w11, w11, #25 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #12 \n\t" + "EXT v9.16B, v9.16B, v9.16B, #4 \n\t" + "EXT v10.16B, v10.16B, v10.16B, #8 \n\t" + "EXT v11.16B, v11.16B, v11.16B, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD w4, w4, w9 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD w5, w5, w10 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ADD w6, w6, w11 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w7, w7, w8 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR w19, w19, w4 \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "EOR w16, w16, w5 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "EOR w17, w17, w6 \n\t" + "REV32 v3.8H, v3.8H \n\t" + "EOR w18, w18, w7 \n\t" + "REV32 v7.8H, v7.8H \n\t" + "ROR w19, w19, #16 \n\t" + "REV32 v11.8H, v11.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ROR w16, w16, #16 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ROR w17, w17, #16 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w18, w18, #16 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ADD w14, w14, w19 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ADD w15, 
w15, w16 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ADD w12, w12, w17 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w13, w13, w18 \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "EOR w9, w9, w14 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "EOR w10, w10, w15 \n\t" + "SHL v9.4S, v14.4S, #12 \n\t" + "EOR w11, w11, w12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "EOR w8, w8, w13 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + "ROR w9, w9, #20 \n\t" + "SRI v9.4S, v14.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ROR w10, w10, #20 \n\t" + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ROR w11, w11, #20 \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "ROR w8, w8, #20 \n\t" + "ADD v8.4S, v8.4S, v9.4S \n\t" + "ADD w4, w4, w9 \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "ADD w5, w5, w10 \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "ADD w6, w6, w11 \n\t" + "EOR v11.16B, v11.16B, v8.16B \n\t" + "ADD w7, w7, w8 \n\t" + "TBL v3.16B, { v3.16B }, v24.16B \n\t" + "EOR w19, w19, w4 \n\t" + "TBL v7.16B, { v7.16B }, v24.16B \n\t" + "EOR w16, w16, w5 \n\t" + "TBL v11.16B, { v11.16B }, v24.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "EOR w17, w17, w6 \n\t" + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR w18, w18, w7 \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "ROR w19, w19, #24 \n\t" + "ADD v10.4S, v10.4S, v11.4S \n\t" + "ROR w16, w16, #24 \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "ROR w17, w17, #24 \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "ROR w18, w18, #24 \n\t" + "EOR v14.16B, v9.16B, v10.16B \n\t" + "ADD w14, w14, w19 \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "ADD w15, w15, w16 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "ADD w12, w12, w17 \n\t" + "SHL v9.4S, v14.4S, #7 \n\t" + "ADD w13, w13, w18 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EOR w9, w9, w14 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EOR w10, w10, w15 \n\t" + "SRI v9.4S, v14.4S, #25 \n\t" + "EOR w11, w11, w12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EOR w8, w8, w13 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "ROR w9, w9, #25 \n\t" + "EXT v3.16B, v3.16B, 
v3.16B, #4 \n\t" + "ROR w10, w10, #25 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #12 \n\t" + "ROR w11, w11, #25 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "ROR w8, w8, #25 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #4 \n\t" + "EXT v9.16B, v9.16B, v9.16B, #12 \n\t" + "EXT v10.16B, v10.16B, v10.16B, #8 \n\t" + "EXT v11.16B, v11.16B, v11.16B, #4 \n\t" + "BNE L_chacha20_arm64_256_loop_%= \n\t" + /* Load message */ + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + /* Add one (2 added during calculating vector results) */ + "ADD w16, w16, #1 \n\t" + /* Add back state, XOR in message and store (load next block) */ + "ADD v0.4S, v0.4S, v20.4S \n\t" + "ADD v1.4S, v1.4S, v21.4S \n\t" + "ADD v2.4S, v2.4S, v22.4S \n\t" + "ADD v3.4S, v3.4S, v23.4S \n\t" + "EOR v0.16B, v0.16B, v16.16B \n\t" + "EOR v1.16B, v1.16B, v17.16B \n\t" + "EOR v2.16B, v2.16B, v18.16B \n\t" + "EOR v3.16B, v3.16B, v19.16B \n\t" + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + "ST1 {v0.4S-v3.4S}, [%[c]], #64 \n\t" + "MOV v23.S[0], w20 \n\t" + "ADD v4.4S, v4.4S, v20.4S \n\t" + "ADD v5.4S, v5.4S, v21.4S \n\t" + "ADD v6.4S, v6.4S, v22.4S \n\t" + "ADD v7.4S, v7.4S, v23.4S \n\t" + "EOR v4.16B, v4.16B, v16.16B \n\t" + "EOR v5.16B, v5.16B, v17.16B \n\t" + "EOR v6.16B, v6.16B, v18.16B \n\t" + "EOR v7.16B, v7.16B, v19.16B \n\t" + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t" + "MOV v23.S[0], w21 \n\t" + "ADD v8.4S, v8.4S, v20.4S \n\t" + "ADD v9.4S, v9.4S, v21.4S \n\t" + "ADD v10.4S, v10.4S, v22.4S \n\t" + "ADD v11.4S, v11.4S, v23.4S \n\t" + "EOR v8.16B, v8.16B, v16.16B \n\t" + "EOR v9.16B, v9.16B, v17.16B \n\t" + "EOR v10.16B, v10.16B, v18.16B \n\t" + "EOR v11.16B, v11.16B, v19.16B \n\t" + "LD1 {v16.4S-v19.4S}, [%[m]], #64 \n\t" + "ST1 {v8.4S-v11.4S}, [%[c]], #64 \n\t" + /* Move regular registers into vector registers for adding and xor */ + "ORR x4, x4, x5, lsl #32 \n\t" + "ORR x6, x6, x7, lsl #32 \n\t" + "ORR x8, x8, x9, lsl #32 \n\t" + "MOV v12.D[0], x4 \n\t" + "ORR x10, x10, x11, lsl #32 \n\t" + 
"MOV v12.D[1], x6 \n\t" + "ORR x12, x12, x13, lsl #32 \n\t" + "MOV v13.D[0], x8 \n\t" + "ORR x14, x14, x15, lsl #32 \n\t" + "MOV v13.D[1], x10 \n\t" + "ORR x16, x16, x17, lsl #32 \n\t" + "MOV v14.D[0], x12 \n\t" + "ORR x18, x18, x19, lsl #32 \n\t" + "MOV v14.D[1], x14 \n\t" + "MOV v15.D[0], x16 \n\t" + "MOV v15.D[1], x18 \n\t" + /* Add back state, XOR in message and store */ + "ADD v12.4S, v12.4S, v20.4S \n\t" + "ADD v13.4S, v13.4S, v21.4S \n\t" + "ADD v14.4S, v14.4S, v22.4S \n\t" + "ADD v15.4S, v15.4S, v23.4S \n\t" + "EOR v12.16B, v12.16B, v16.16B \n\t" + "EOR v13.16B, v13.16B, v17.16B \n\t" + "EOR v14.16B, v14.16B, v18.16B \n\t" + "EOR v15.16B, v15.16B, v19.16B \n\t" + "ST1 {v12.4S-v15.4S}, [%[c]], #64 \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c) + : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8) + : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9", + "x10", "x11", "x12", "x13", "x14", "x15", "x16", + "x17", "x18", "x19", "x20", "x21", "v0", "v1", + "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23" + ); +#else + word32 x[CHACHA_CHUNK_WORDS]; + word32* x_addr = x; + __asm__ __volatile__ ( + // The paper NEON crypto by Daniel J. 
Bernstein and Peter Schwabe was used to optimize for ARM + // https://cryptojedi.org/papers/neoncrypto-20120320.pdf + + "LDR r14, %[input] \n\t" // load input address + "MOV r11, #1 \n\t" + + "LDM r14, { r0-r12 } \n\t" + "STRD r10, r11, %[x_10] \n\t" + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 10 11 12 + "VMOV d0, r0, r1 \n\t" + "VMOV d1, r2, r3 \n\t" + "VMOV d2, r4, r5 \n\t" + "VMOV d3, r6, r7 \n\t" + "VMOV d4, r8, r9 \n\t" + "VMOV d5, r10, r11 \n\t" + "VMOV q4, q0 \n\t" + "VMOV q5, q1 \n\t" + "VMOV q6, q2 \n\t" + "VMOV q8, q0 \n\t" + "VMOV q9, q1 \n\t" + "VMOV q10, q2 \n\t" + "LDRD r11, r10, [r14, #4*14] \n\t" + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 15 14 12 + "VMOV d7, r11, r10 \n\t" + "STR r10, %[x_15] \n\t" + "VMOV d15, r11, r10 \n\t" + "VMOV d23, r11, r10 \n\t" + "MOV r10, r12 \n\t" + "MOV r12, r11 \n\t" + "LDR r11, [r14, #4*13] \n\t" + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 12 13 14 + + "MOV r14, %[rounds] \n\t" + + "VMOV d6, r10, r11 \n\t" + "ADD r10, r10, #1 \n\t" + "VMOV d14, r10, r11 \n\t" + "ADD r10, r10, #1 \n\t" + "VMOV d22, r10, r11 \n\t" + "ADD r10, r10, #1 \n\t" // ARM calculates the fourth block (two was already added earlier) + "\n" + "L_chacha20_arm32_256_loop_%=: \n\t" + "SUBS r14, r14, #1 \n\t" + + // 0, 4, 8, 12 + // 1, 5, 9, 13 + + // ODD ROUND + "ADD r0, r0, r4 \n\t" // 0 0 4 + "VADD.I32 q0, q0, q1 \n\t" + "ADD r1, r1, r5 \n\t" // 1 1 5 + "VADD.I32 q4, q4, q5 \n\t" + "EOR r10, r10, r0 \n\t" // 12 12 0 + "VADD.I32 q8, q8, q9 \n\t" + "EOR r11, r11, r1 \n\t" // 13 13 1 + "VEOR q12, q3, q0 \n\t" + "ROR r10, r10, #16 \n\t" // 12 12 + "VEOR q13, q7, q4 \n\t" + "ROR r11, r11, #16 \n\t" // 13 13 + "VEOR q14, q11, q8 \n\t" + "ADD r8, r8, r10 \n\t" // 8 8 12 + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q12 \n\t" + "ADD r9, r9, r11 \n\t" // 9 9 13 + "VREV32.16 q7, q13 \n\t" + "EOR r4, r4, r8 \n\t" // 4 4 
8 + "VREV32.16 q11, q14 \n\t" + + "EOR r5, r5, r9 \n\t" // 5 5 9 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r4, r4, #20 \n\t" // 4 4 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r5, r5, #20 \n\t" // 5 5 + "VADD.I32 q10, q10, q11 \n\t" + "ADD r0, r0, r4 \n\t" // 0 0 4 + "VEOR q12, q1, q2 \n\t" + "ADD r1, r1, r5 \n\t" // 1 1 5 + "VEOR q13, q5, q6 \n\t" + "EOR r10, r10, r0 \n\t" // 12 12 0 + "VEOR q14, q9, q10 \n\t" + "EOR r11, r11, r1 \n\t" // 13 13 1 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #12 \n\t" + "ROR r10, r10, #24 \n\t" // 12 12 + "VSHL.I32 q5, q13, #12 \n\t" + "ROR r11, r11, #24 \n\t" // 13 13 + "VSHL.I32 q9, q14, #12 \n\t" + "ADD r8, r8, r10 \n\t" // 8 8 12 + "VSRI.I32 q1, q12, #20 \n\t" + "ADD r9, r9, r11 \n\t" // 9 9 13 + "VSRI.I32 q5, q13, #20 \n\t" + "STR r11, %[x_13] \n\t" + "VSRI.I32 q9, q14, #20 \n\t" + + "LDR r11, %[x_15] \n\t" + "VADD.I32 q0, q0, q1 \n\t" + "EOR r4, r4, r8 \n\t" // 4 4 8 + "VADD.I32 q4, q4, q5 \n\t" + "STR r8, %[x_8] \n\t" + "VADD.I32 q8, q8, q9 \n\t" + "LDR r8, %[x_10] \n\t" + "VEOR q12, q3, q0 \n\t" + "EOR r5, r5, r9 \n\t" // 5 5 9 + "VEOR q13, q7, q4 \n\t" + "STR r9, %[x_9] \n\t" + "VEOR q14, q11, q8 \n\t" + "LDR r9, %[x_11] \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q12, #8 \n\t" + "ROR r4, r4, #25 \n\t" // 4 4 + "VSHL.I32 q7, q13, #8 \n\t" + "ROR r5, r5, #25 \n\t" // 5 5 + "VSHL.I32 q11, q14, #8 \n\t" + + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 10 11 12 15 14 + + // 2, 6, 10, 14 + // 3, 7, 11, 15 + + "ADD r2, r2, r6 \n\t" // 2 2 6 + "VSRI.I32 q3, q12, #24 \n\t" + "ADD r3, r3, r7 \n\t" // 3 3 7 + "VSRI.I32 q7, q13, #24 \n\t" + "EOR r12, r12, r2 \n\t" // 14 14 2 + "VSRI.I32 q11, q14, #24 \n\t" + + "EOR r11, r11, r3 \n\t" // 15 15 3 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r12, r12, #16 \n\t" // 14 14 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r11, r11, #16 \n\t" // 15 15 + 
"VADD.I32 q10, q10, q11 \n\t" + "ADD r8, r8, r12 \n\t" // 10 10 14 + "VEOR q12, q1, q2 \n\t" + "ADD r9, r9, r11 \n\t" // 11 11 15 + "VEOR q13, q5, q6 \n\t" + "EOR r6, r6, r8 \n\t" // 6 6 10 + "VEOR q14, q9, q10 \n\t" + "EOR r7, r7, r9 \n\t" // 7 7 11 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #7 \n\t" + "ROR r6, r6, #20 \n\t" // 6 6 + "VSHL.I32 q5, q13, #7 \n\t" + "ROR r7, r7, #20 \n\t" // 7 7 + "VSHL.I32 q9, q14, #7 \n\t" + "ADD r2, r2, r6 \n\t" // 2 2 6 + "VSRI.I32 q1, q12, #25 \n\t" + "ADD r3, r3, r7 \n\t" // 3 3 7 + "VSRI.I32 q5, q13, #25 \n\t" + "EOR r12, r12, r2 \n\t" // 14 14 2 + "VSRI.I32 q9, q14, #25 \n\t" + + // EVEN ROUND + + "EOR r11, r11, r3 \n\t" // 15 15 3 + "VEXT.8 q1, q1, q1, #4 \n\t" // permute elements left by one + "ROR r12, r12, #24 \n\t" // 14 14 + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "ROR r11, r11, #24 \n\t" // 15 15 + "VEXT.8 q3, q3, q3, #12 \n\t" // permute elements left by three + + "ADD r8, r8, r12 \n\t" // 10 10 14 + "VEXT.8 q5, q5, q5, #4 \n\t" // permute elements left by one + "ADD r9, r9, r11 \n\t" // 11 11 15 + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "EOR r6, r6, r8 \n\t" // 6 6 10 + "VEXT.8 q7, q7, q7, #12 \n\t" // permute elements left by three + + "EOR r7, r7, r9 \n\t" // 7 7 11 + "VEXT.8 q9, q9, q9, #4 \n\t" // permute elements left by one + "ROR r6, r6, #25 \n\t" // 6 6 + "VEXT.8 q10, q10, q10, #8 \n\t" // permute elements left by two + "ROR r7, r7, #25 \n\t" // 7 7 + "VEXT.8 q11, q11, q11, #12 \n\t" // permute elements left by three + + // 0, 5, 10, 15 + // 1, 6, 11, 12 + + "ADD r0, r0, r5 \n\t" // 0 0 5 + "VADD.I32 q0, q0, q1 \n\t" + "ADD r1, r1, r6 \n\t" // 1 1 6 + "VADD.I32 q4, q4, q5 \n\t" + "EOR r11, r11, r0 \n\t" // 15 15 0 + "VADD.I32 q8, q8, q9 \n\t" + "EOR r10, r10, r1 \n\t" // 12 12 1 + "VEOR q12, q3, q0 \n\t" + "ROR r11, r11, #16 \n\t" // 15 15 + "VEOR q13, q7, q4 \n\t" + "ROR r10, r10, #16 
\n\t" // 12 12 + "VEOR q14, q11, q8 \n\t" + "ADD r8, r8, r11 \n\t" // 10 10 15 + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q12 \n\t" + "ADD r9, r9, r10 \n\t" // 11 11 12 + "VREV32.16 q7, q13 \n\t" + "EOR r5, r5, r8 \n\t" // 5 5 10 + "VREV32.16 q11, q14 \n\t" + + "EOR r6, r6, r9 \n\t" // 6 6 11 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r5, r5, #20 \n\t" // 5 5 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r6, r6, #20 \n\t" // 6 6 + "VADD.I32 q10, q10, q11 \n\t" + "ADD r0, r0, r5 \n\t" // 0 0 5 + "VEOR q12, q1, q2 \n\t" + "ADD r1, r1, r6 \n\t" // 1 1 6 + "VEOR q13, q5, q6 \n\t" + "EOR r11, r11, r0 \n\t" // 15 15 0 + "VEOR q14, q9, q10 \n\t" + "EOR r10, r10, r1 \n\t" // 12 12 1 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #12 \n\t" + "ROR r11, r11, #24 \n\t" // 15 15 + "VSHL.I32 q5, q13, #12 \n\t" + "ROR r10, r10, #24 \n\t" // 12 12 + "VSHL.I32 q9, q14, #12 \n\t" + "ADD r8, r8, r11 \n\t" // 10 10 15 + "VSRI.I32 q1, q12, #20 \n\t" + "STR r11, %[x_15] \n\t" + "VSRI.I32 q5, q13, #20 \n\t" + "LDR r11, %[x_13] \n\t" + "VSRI.I32 q9, q14, #20 \n\t" + + "ADD r9, r9, r10 \n\t" // 11 11 12 + "VADD.I32 q0, q0, q1 \n\t" + "EOR r5, r5, r8 \n\t" // 5 5 10 + "VADD.I32 q4, q4, q5 \n\t" + "STR r8, %[x_10] \n\t" + "VADD.I32 q8, q8, q9 \n\t" + "LDR r8, %[x_8] \n\t" + "VEOR q12, q3, q0 \n\t" + "EOR r6, r6, r9 \n\t" // 6 6 11 + "VEOR q13, q7, q4 \n\t" + "STR r9, %[x_11] \n\t" + "VEOR q14, q11, q8 \n\t" + "LDR r9, %[x_9] \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q12, #8 \n\t" + "ROR r5, r5, #25 \n\t" // 5 5 + "VSHL.I32 q7, q13, #8 \n\t" + "ROR r6, r6, #25 \n\t" // 6 6 + "VSHL.I32 q11, q14, #8 \n\t" + + // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 12 13 14 + + // 2, 7, 8, 13 + // 3, 4, 9, 14 + + "ADD r2, r2, r7 \n\t" // 2 2 7 + "VSRI.I32 q3, q12, #24 \n\t" + "ADD r3, r3, 
r4 \n\t" // 3 3 4 + "VSRI.I32 q7, q13, #24 \n\t" + "EOR r11, r11, r2 \n\t" // 13 13 2 + "VSRI.I32 q11, q14, #24 \n\t" + + "EOR r12, r12, r3 \n\t" // 14 14 3 + "VADD.I32 q2, q2, q3 \n\t" + "ROR r11, r11, #16 \n\t" // 13 13 + "VADD.I32 q6, q6, q7 \n\t" + "ROR r12, r12, #16 \n\t" // 14 14 + "VADD.I32 q10, q10, q11 \n\t" + "ADD r8, r8, r11 \n\t" // 8 8 13 + "VEOR q12, q1, q2 \n\t" + "ADD r9, r9, r12 \n\t" // 9 9 14 + "VEOR q13, q5, q6 \n\t" + "EOR r7, r7, r8 \n\t" // 7 7 8 + "VEOR q14, q9, q10 \n\t" + "EOR r4, r4, r9 \n\t" // 4 4 9 + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q12, #7 \n\t" + "ROR r7, r7, #20 \n\t" // 7 7 + "VSHL.I32 q5, q13, #7 \n\t" + "ROR r4, r4, #20 \n\t" // 4 4 + "VSHL.I32 q9, q14, #7 \n\t" + "ADD r2, r2, r7 \n\t" // 2 2 7 + "VSRI.I32 q1, q12, #25 \n\t" + "ADD r3, r3, r4 \n\t" // 3 3 4 + "VSRI.I32 q5, q13, #25 \n\t" + "EOR r11, r11, r2 \n\t" // 13 13 2 + "VSRI.I32 q9, q14, #25 \n\t" + + "EOR r12, r12, r3 \n\t" // 14 14 3 + "VEXT.8 q1, q1, q1, #12 \n\t" // permute elements left by three + "ROR r11, r11, #24 \n\t" // 13 13 + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "ROR r12, r12, #24 \n\t" // 14 14 + "VEXT.8 q3, q3, q3, #4 \n\t" // permute elements left by one + + "ADD r8, r8, r11 \n\t" // 8 8 13 + "VEXT.8 q5, q5, q5, #12 \n\t" // permute elements left by three + "ADD r9, r9, r12 \n\t" // 9 9 14 + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "EOR r7, r7, r8 \n\t" // 7 7 8 + "VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one + + "EOR r4, r4, r9 \n\t" // 4 4 9 + "VEXT.8 q9, q9, q9, #12 \n\t" // permute elements left by three + "ROR r7, r7, #25 \n\t" // 7 7 + "VEXT.8 q10, q10, q10, #8 \n\t" // permute elements left by two + "ROR r4, r4, #25 \n\t" // 4 4 + "VEXT.8 q11, q11, q11, #4 \n\t" // permute elements left by one + + "BNE L_chacha20_arm32_256_loop_%= \n\t" + + "LDR r14, %[x_addr] \n\t" // load address of x to r14 + // r0 r1 r2 r3 
r4 r5 r6 r7 r8 r9 r10 r11 r12 + // 0 1 2 3 4 5 6 7 8 9 12 13 14 + "ADD r10, r10, #3 \n\t" // add three here to make later NEON easier + "STM r14, { r0-r9 } \n\t" + "STRD r10, r11, [r14, #4*12] \n\t" + "LDR r9, %[input] \n\t" // load input address + "STR r12, [r14, #4*14] \n\t" + "LDR r10, %[c] \n\t" // load c address + + "VLDM r9, { q12-q15 } \n\t" + "LDR r12, %[m] \n\t" // load m address + + "VADD.I32 q0, q0, q12 \n\t" + "VADD.I32 q1, q1, q13 \n\t" + "VADD.I32 q2, q2, q14 \n\t" + "VADD.I32 q3, q3, q15 \n\t" + + "VADD.I32 q4, q4, q12 \n\t" + "VADD.I32 q5, q5, q13 \n\t" + "VADD.I32 q6, q6, q14 \n\t" + "VADD.I32 q7, q7, q15 \n\t" + + "MOV r11, #1 \n\t" + + "VADD.I32 q8, q8, q12 \n\t" + "VMOV.I32 q12, #0 \n\t" + "VADD.I32 q9, q9, q13 \n\t" + "VMOV.I32 d24[0], r11 \n\t" + "VADD.I32 q10, q10, q14 \n\t" + "VADD.I32 q11, q11, q15 \n\t" + + "VADD.I32 q11, q11, q12 \n\t" // add one to counter + "VADD.I32 q7, q7, q12 \n\t" // add one to counter + "VADD.I32 q11, q11, q12 \n\t" // add one to counter + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VEOR q0, q0, q12 \n\t" + "VEOR q1, q1, q13 \n\t" + "VEOR q2, q2, q14 \n\t" + "VEOR q3, q3, q15 \n\t" + "VSTM r10!, { q0-q3 } \n\t" // store to c + + "VLDM r14, { q0-q3 } \n\t " // load final block from x + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VEOR q4, q4, q12 \n\t" + "VEOR q5, q5, q13 \n\t" + "VEOR q6, q6, q14 \n\t" + "VEOR q7, q7, q15 \n\t" + "VSTM r10!, { q4-q7 } \n\t" // store to c + + "VLDM r9, { q4-q7 } \n\t" // load input + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VEOR q8, q8, q12 \n\t" + "VEOR q9, q9, q13 \n\t" + "VEOR q10, q10, q14 \n\t" + "VEOR q11, q11, q15 \n\t" + "VSTM r10!, { q8-q11 } \n\t" // store to c + + "VLDM r12!, { q12-q15 } \n\t" // load m + "VADD.I32 q0, q0, q4 \n\t" + "VADD.I32 q1, q1, q5 \n\t" + "VADD.I32 q2, q2, q6 \n\t" + "VADD.I32 q3, q3, q7 \n\t" // three was added earlier + "VEOR q0, q0, q12 \n\t" + "VEOR q1, q1, q13 \n\t" + "VEOR q2, q2, q14 \n\t" + "VEOR q3, q3, q15 \n\t" + "VSTM r10!, { q0-q3 
} \n\t" // store to c + + : [c] "+m" (c), + [x_0] "=m" (x), + [x_8] "=m" (x[8]), + [x_9] "=m" (x[9]), + [x_10] "=m" (x[10]), + [x_11] "=m" (x[11]), + [x_13] "=m" (x[13]), + [x_15] "=m" (x[15]) + : [rounds] "I" (ROUNDS/2), [input] "m" (input), + [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES), + [m] "m" (m), [x_addr] "m" (x_addr) + : "memory", "cc", + "r0", "r1", "r2", "r3", + "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r14", + "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); + +#endif /* __aarch64__ */ + return CHACHA_CHUNK_BYTES * 4; +} + + +static WC_INLINE int wc_Chacha_encrypt_128(const word32 input[CHACHA_CHUNK_WORDS], const byte* m, byte* c) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_128\n"); +#endif /*CHACHA_TEST */ + +#ifdef __aarch64__ + __asm__ __volatile__ ( + /* Load incrementer register to modify counter */ + "LD1 {v22.16B}, [%[L_chacha20_neon_inc_first_word]] \n\t" + /* Load index look-up for rotating left 8 bits */ + "LD1 {v23.16B}, [%[L_chacha20_neon_rol8]] \n\t" + /* Load state to encrypt */ + "LD1 {v18.4S-v21.4S}, [%[input]] \n\t" + /* Load message */ + "LD1 {v14.4S-v17.4S}, [%[m]], #64 \n\t" + /* Move state into vector registers (x3) */ + "MOV v0.16B, v18.16B \n\t" + "MOV v1.16B, v19.16B \n\t" + "MOV v2.16B, v20.16B \n\t" + "MOV v3.16B, v21.16B \n\t" + "MOV v4.16B, v18.16B \n\t" + "MOV v5.16B, v19.16B \n\t" + "MOV v6.16B, v20.16B \n\t" + "MOV v7.16B, v21.16B \n\t" + /* Add counter word */ + "ADD v7.4S, v7.4S, v22.4S \n\t" + /* Set number of odd+even rounds to perform */ + "MOV w3, #10 \n\t" + "\n" + "L_chacha20_arm64_128_loop_%=: \n\t" + "SUBS w3, w3, #1 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + "REV32 v7.8H, v7.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, 
v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "TBL v3.16B, { v3.16B }, v23.16B \n\t" + "TBL v7.16B, { v7.16B }, v23.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #4 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + "REV32 v7.8H, v7.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SHL v5.4S, v13.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + "SRI v5.4S, v13.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "ADD v4.4S, v4.4S, v5.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "EOR v7.16B, v7.16B, v4.16B \n\t" + "TBL v3.16B, { v3.16B }, v23.16B \n\t" + "TBL v7.16B, { v7.16B }, v23.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "ADD v6.4S, v6.4S, v7.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "EOR 
v13.16B, v5.16B, v6.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SHL v5.4S, v13.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "SRI v5.4S, v13.4S, #25 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v5.16B, v5.16B, v5.16B, #12 \n\t" + "EXT v6.16B, v6.16B, v6.16B, #8 \n\t" + "EXT v7.16B, v7.16B, v7.16B, #4 \n\t" + "BNE L_chacha20_arm64_128_loop_%= \n\t" + /* Add back state, XOR in message and store (load next block) */ + "ADD v0.4S, v0.4S, v18.4S \n\t" + "ADD v1.4S, v1.4S, v19.4S \n\t" + "ADD v2.4S, v2.4S, v20.4S \n\t" + "ADD v3.4S, v3.4S, v21.4S \n\t" + "EOR v0.16B, v0.16B, v14.16B \n\t" + "EOR v1.16B, v1.16B, v15.16B \n\t" + "EOR v2.16B, v2.16B, v16.16B \n\t" + "EOR v3.16B, v3.16B, v17.16B \n\t" + "LD1 {v14.4S-v17.4S}, [%[m]], #64 \n\t" + "ST1 {v0.4S-v3.4S}, [%[c]], #64 \n\t" + "ADD v21.4S, v21.4S, v22.4S \n\t" + "ADD v4.4S, v4.4S, v18.4S \n\t" + "ADD v5.4S, v5.4S, v19.4S \n\t" + "ADD v6.4S, v6.4S, v20.4S \n\t" + "ADD v7.4S, v7.4S, v21.4S \n\t" + "EOR v4.16B, v4.16B, v14.16B \n\t" + "EOR v5.16B, v5.16B, v15.16B \n\t" + "EOR v6.16B, v6.16B, v16.16B \n\t" + "EOR v7.16B, v7.16B, v17.16B \n\t" + "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c) + : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8), + [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word) + : "memory", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21" + ); +#else + __asm__ __volatile__ ( + "MOV r11, %[rounds] \n\t" + "MOV r12, #1 \n\t" + "VLDM %[input], { q0-q3 } \n\t" + "VMOV.I32 q8, #0 \n\t" + "VMOV q4, q0 \n\t" + "VMOV.I32 d16[0], r12 \n\t" + "VMOV q5, q1 \n\t" + "VMOV q6, q2 \n\t" + "VADD.I32 q7, q3, q8 \n\t" // add one to counter + + // store input + "VMOV q10, q0 \n\t" + "VMOV q11, q1 \n\t" + "VMOV q12, q2 \n\t" + "VMOV q13, q3 \n\t" + "\n" + 
"L_chacha20_arm32_128_loop_%=: \n\t" + "SUBS r11, r11, #1 \n\t" + + // ODD ROUND + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q8 \n\t" + "VREV32.16 q7, q9 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #12 \n\t" + "VSHL.I32 q5, q9, #12 \n\t" + "VSRI.I32 q1, q8, #20 \n\t" + "VSRI.I32 q5, q9, #20 \n\t" + + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q8, #8 \n\t" + "VSHL.I32 q7, q9, #8 \n\t" + "VSRI.I32 q3, q8, #24 \n\t" + "VSRI.I32 q7, q9, #24 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #7 \n\t" + "VSHL.I32 q5, q9, #7 \n\t" + "VSRI.I32 q1, q8, #25 \n\t" + "VSRI.I32 q5, q9, #25 \n\t" + + // EVEN ROUND + + "VEXT.8 q1, q1, q1, #4 \n\t" // permute elements left by one + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "VEXT.8 q3, q3, q3, #12 \n\t" // permute elements left by three + + "VEXT.8 q5, q5, q5, #4 \n\t" // permute elements left by one + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "VEXT.8 q7, q7, q7, #12 \n\t" // permute elements left by three + + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words + "VREV32.16 q3, q8 \n\t" + "VREV32.16 q7, q9 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + 
"VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #12 \n\t" + "VSHL.I32 q5, q9, #12 \n\t" + "VSRI.I32 q1, q8, #20 \n\t" + "VSRI.I32 q5, q9, #20 \n\t" + + "VADD.I32 q0, q0, q1 \n\t" + "VADD.I32 q4, q4, q5 \n\t" + "VEOR q8, q3, q0 \n\t" + "VEOR q9, q7, q4 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q3, q8, #8 \n\t" + "VSHL.I32 q7, q9, #8 \n\t" + "VSRI.I32 q3, q8, #24 \n\t" + "VSRI.I32 q7, q9, #24 \n\t" + + "VADD.I32 q2, q2, q3 \n\t" + "VADD.I32 q6, q6, q7 \n\t" + "VEOR q8, q1, q2 \n\t" + "VEOR q9, q5, q6 \n\t" + // SIMD instructions don't support rotation so we have to cheat using shifts and a help register + "VSHL.I32 q1, q8, #7 \n\t" + "VSHL.I32 q5, q9, #7 \n\t" + "VSRI.I32 q1, q8, #25 \n\t" + "VSRI.I32 q5, q9, #25 \n\t" + + "VEXT.8 q1, q1, q1, #12 \n\t" // permute elements left by three + "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two + "VEXT.8 q3, q3, q3, #4 \n\t" // permute elements left by one + + "VEXT.8 q5, q5, q5, #12 \n\t" // permute elements left by three + "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two + "VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one + + "BNE L_chacha20_arm32_128_loop_%= \n\t" + + "VMOV.I32 q8, #0 \n\t" + "VADD.I32 q0, q0, q10 \n\t" + "VADD.I32 q1, q1, q11 \n\t" + "VMOV.I32 d16[0], r12 \n\t" + "VADD.I32 q2, q2, q12 \n\t" + "VADD.I32 q3, q3, q13 \n\t" + + "VADD.I32 q13, q13, q8 \n\t" // add one to counter + + "VADD.I32 q4, q4, q10 \n\t" + "VADD.I32 q5, q5, q11 \n\t" + "VADD.I32 q6, q6, q12 \n\t" + "VADD.I32 q7, q7, q13 \n\t" + + "VLDM %[m], { q8-q15 } \n\t" + "VEOR q0, q0, q8 \n\t" + "VEOR q1, q1, q9 \n\t" + "VEOR q2, q2, q10 \n\t" + "VEOR q3, q3, q11 \n\t" + "VEOR q4, q4, q12 \n\t" + "VEOR q5, q5, q13 \n\t" + "VEOR q6, q6, q14 \n\t" + "VEOR q7, q7, q15 \n\t" + "VSTM %[c], { q0-q7 } 
\n\t" + + : [c] "+r" (c), [m] "+r" (m) + : [rounds] "I" (ROUNDS/2), [input] "r" (input), + [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES) + : "memory", "cc", + "r11", "r12", + "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif /* __aarch64__ */ + return CHACHA_CHUNK_BYTES * 2; +} + +static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m, + byte* c, word32 bytes) +{ +#ifdef CHACHA_TEST + printf("Entering wc_Chacha_encrypt_64 with %d bytes\n", bytes); +#endif /*CHACHA_TEST */ + +#ifdef __aarch64__ + __asm__ __volatile__ ( + /* Load index look-up for rotating left 8 bits */ + "LD1 {v13.16B}, [%[L_chacha20_neon_rol8]] \n\t" + "LD1 {v14.4S}, [%[L_chacha20_neon_inc_first_word]] \n\t" + /* Load state to encrypt */ + "LD1 {v8.4S-v11.4S}, [%[input]] \n\t" + "\n" + "L_chacha20_arm64_64_loop_%=: \n\t" + /* Move state into vector registers (x3) */ + "MOV v0.16B, v8.16B \n\t" + "MOV v1.16B, v9.16B \n\t" + "MOV v2.16B, v10.16B \n\t" + "MOV v3.16B, v11.16B \n\t" + /* Add counter word */ + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; 
b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 
\n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, 
v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + 
/* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, 
v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + 
"EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 
v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 \n\t" + "EXT v3.16B, v3.16B, v3.16B, #12 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #4 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "REV32 v3.8H, v3.8H \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #12 \n\t" + "SRI v1.4S, v12.4S, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "ADD v0.4S, v0.4S, v1.4S \n\t" + "EOR v3.16B, v3.16B, v0.16B \n\t" + "TBL v3.16B, { v3.16B }, v13.16B \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "ADD v2.4S, v2.4S, v3.4S \n\t" + "EOR v12.16B, v1.16B, v2.16B \n\t" + "SHL v1.4S, v12.4S, #7 \n\t" + "SRI v1.4S, v12.4S, #25 
\n\t" + "EXT v3.16B, v3.16B, v3.16B, #4 \n\t" + "EXT v1.16B, v1.16B, v1.16B, #12 \n\t" + "EXT v2.16B, v2.16B, v2.16B, #8 \n\t" + /* Add back state */ + "ADD v0.4S, v0.4S, v8.4S \n\t" + "ADD v1.4S, v1.4S, v9.4S \n\t" + "ADD v2.4S, v2.4S, v10.4S \n\t" + "ADD v3.4S, v3.4S, v11.4S \n\t" + "CMP %[bytes], #64 \n\t" + "BLT L_chacha20_arm64_64_lt_64_%= \n\t" + "LD1 {v4.4S-v7.4S}, [%[m]], #64 \n\t" + "EOR v4.16B, v4.16B, v0.16B \n\t" + "EOR v5.16B, v5.16B, v1.16B \n\t" + "EOR v6.16B, v6.16B, v2.16B \n\t" + "EOR v7.16B, v7.16B, v3.16B \n\t" + "ST1 {v4.4S-v7.4S}, [%[c]], #64 \n\t" + "SUBS %[bytes], %[bytes], #64 \n\t" + "ADD v11.4S, v11.4S, v14.4S \n\t" + "BNE L_chacha20_arm64_64_loop_%= \n\t" + "B L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_64_%=: \n\t" + "CMP %[bytes], #32 \n\t" + "BLT L_chacha20_arm64_64_lt_32_%= \n\t" + "LD1 {v4.4S, v5.4S}, [%[m]], #32 \n\t" + "EOR v4.16B, v4.16B, v0.16B \n\t" + "EOR v5.16B, v5.16B, v1.16B \n\t" + "ST1 {v4.4S, v5.4S}, [%[c]], #32 \n\t" + "SUBS %[bytes], %[bytes], #32 \n\t" + "MOV v0.16B, v2.16B \n\t" + "MOV v1.16B, v3.16B \n\t" + "BEQ L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_32_%=: \n\t" + "CMP %[bytes], #16 \n\t" + "BLT L_chacha20_arm64_64_lt_16_%= \n\t" + "LD1 {v4.4S}, [%[m]], #16 \n\t" + "EOR v4.16B, v4.16B, v0.16B \n\t" + "ST1 {v4.4S}, [%[c]], #16 \n\t" + "SUBS %[bytes], %[bytes], #16 \n\t" + "MOV v0.16B, v1.16B \n\t" + "BEQ L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_16_%=: \n\t" + "CMP %[bytes], #8 \n\t" + "BLT L_chacha20_arm64_64_lt_8_%= \n\t" + "LD1 {v4.2S}, [%[m]], #8 \n\t" + "EOR v4.8B, v4.8B, v0.8B \n\t" + "ST1 {v4.2S}, [%[c]], #8 \n\t" + "SUBS %[bytes], %[bytes], #8 \n\t" + "MOV v0.D[0], v0.D[1] \n\t" + "BEQ L_chacha20_arm64_64_done_%= \n\t" + "\n" + "L_chacha20_arm64_64_lt_8_%=: \n\t" + "MOV x4, v0.D[0] \n\t" + "LSL x5, %[bytes], #3 \n\t" + "\n" + "L_chacha20_arm64_64_loop_lt_8_%=: \n\t" + "LDRB w6, [%[m], %[bytes]] \n\t" + "ROR x7, x4, x5 \n\t" + "EOR 
w6, w6, w7 \n\t" + "STRB w6, [%[c], %[bytes]] \n\t" + "SUBS %[bytes], %[bytes], #1 \n\t" + "SUB x5, x5, #8 \n\t" + "BGE L_chacha20_arm64_64_loop_lt_8_%= \n\t" + "\n" + "L_chacha20_arm64_64_done_%=: \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes) + : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8), + [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word) + : "memory", "x4", "x5", "x6", "x7", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" + ); +#else + __asm__ __volatile__ ( + /* Get the input state */ + "VLDM %[input], { q8-q11 } \n\t" + /* Get the incrementer register */ + "VLDM %[L_chacha20_neon_inc_first_word], { q14 } \n\t" + "\n" + "L_chacha20_arm32_64_outer_loop_%=: \n\t" + /* Copy over the input state */ + "VMOV q0, q8 \n\t" + "VMOV q1, q9 \n\t" + "VMOV q2, q10 \n\t" + "VMOV q3, q11 \n\t" + /* Compute quarter rounds */ + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, 
q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d 
^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR 
q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + 
"VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ 
+ /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, 
#12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Odd Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, 
q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Odd->Even */ + "VEXT.8 q1, q1, q1, #4 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #12 \n\t" + /* Even Round */ + /* a += b; d ^= a; d <<<= 16; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VREV32.16 q3, q4 \n\t" + /* c += d; b ^= c; b <<<= 12; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #12 \n\t" + "VSRI.I32 q1, q4, #20 \n\t" + /* a += b; d ^= a; d <<<= 8; */ + "VADD.I32 q0, q0, q1 \n\t" + "VEOR q4, q3, q0 \n\t" + "VSHL.I32 q3, q4, #8 \n\t" + "VSRI.I32 q3, q4, #24 \n\t" + /* c += d; b ^= c; b <<<= 7; */ + "VADD.I32 q2, q2, q3 \n\t" + "VEOR q4, q1, q2 \n\t" + "VSHL.I32 q1, q4, #7 \n\t" + "VSRI.I32 q1, q4, #25 \n\t" + /* Permute Even->Odd */ + "VEXT.8 q1, q1, q1, #12 \n\t" + "VEXT.8 q2, q2, q2, #8 \n\t" + "VEXT.8 q3, q3, q3, #4 \n\t" + /* Add back state */ + "VADD.I32 q0, q0, q8 \n\t" + "VADD.I32 q1, q1, q9 \n\t" + "VADD.I32 q2, q2, q10 \n\t" + "VADD.I32 q3, q3, q11 \n\t" + "CMP %[bytes], #64 \n\t" + "BLT L_chacha20_arm32_64_lt_64_%= \n\t" + /* XOR full 64 byte block */ + "VLDM %[m], { q4-q7 } \n\t" + "ADD %[m], %[m], #64 \n\t" + "VEOR q0, q0, q4 \n\t" + "VEOR q1, q1, q5 \n\t" + "VEOR q2, q2, q6 \n\t" + "VEOR q3, q3, q7 \n\t" + "VSTM %[c], { q0-q3 } \n\t" + "ADD %[c], %[c], #64 \n\t" + "SUBS %[bytes], %[bytes], #64 \n\t" + "VADD.I32 q11, q11, q14 \n\t" + "BNE L_chacha20_arm32_64_outer_loop_%= \n\t" + "B L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_64_%=: \n\t" + /* XOR 32 bytes */ + "CMP %[bytes], #32 \n\t" + "BLT L_chacha20_arm32_64_lt_32_%= \n\t" + "VLDM %[m], { q4-q5 } \n\t" + "ADD %[m], %[m], #32 \n\t" + "VEOR q4, q4, q0 \n\t" + "VEOR q5, q5, q1 \n\t" + "VSTM %[c], { q4-q5 } \n\t" + "ADD %[c], %[c], #32 \n\t" + "SUBS %[bytes], %[bytes], #32 \n\t" + "VMOV q0, q2 \n\t" + "VMOV q1, q3 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_32_%=: \n\t" + /* XOR 16 bytes */ + "CMP %[bytes], #16 \n\t" + 
"BLT L_chacha20_arm32_64_lt_16_%= \n\t" + "VLDM %[m], { q4 } \n\t" + "ADD %[m], %[m], #16 \n\t" + "VEOR q4, q4, q0 \n\t" + "VSTM %[c], { q4 } \n\t" + "ADD %[c], %[c], #16 \n\t" + "SUBS %[bytes], %[bytes], #16 \n\t" + "VMOV q0, q1 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_16_%=: \n\t" + /* XOR 8 bytes */ + "CMP %[bytes], #8 \n\t" + "BLT L_chacha20_arm32_64_lt_8_%= \n\t" + "VLDR d8, [%[m], #0] \n\t" + "ADD %[m], %[m], #8 \n\t" + "VEOR d8, d8, d0 \n\t" + "VSTR d8, [%[c], #0] \n\t" + "ADD %[c], %[c], #8 \n\t" + "SUBS %[bytes], %[bytes], #8 \n\t" + "VMOV d0, d1 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_8_%=: \n\t" + /* XOR 4 bytes */ + "CMP %[bytes], #4 \n\t" + "BLT L_chacha20_arm32_64_lt_4_%= \n\t" + "LDR r12, [%[m]], #4 \n\t" + "VMOV r14, d0[0] \n\t" + "EOR r12, r12, r14 \n\t" + "STR r12, [%[c]], #4 \n\t" + "SUBS %[bytes], %[bytes], #4 \n\t" + "VTRN.32 d0, d0 \n\t" + "BEQ L_chacha20_arm32_64_done_%= \n\t" + "\n" + "L_chacha20_arm32_64_lt_4_%=: \n\t" + /* XOR remaining bytes */ + "VMOV r14, d0[0] \n\t" + "\n" + "L_chacha20_arm32_64_lt_4_loop_%=: \n\t" + "LDRB r12, [%[m]], #1 \n\t" + "EOR r12, r12, r14 \n\t" + "STRB r12, [%[c]], #1 \n\t" + "SUBS %[bytes], %[bytes], #1 \n\t" + "LSR r14, r14, #8 \n\t" + "BGT L_chacha20_arm32_64_lt_4_loop_%= \n\t" + "\n" + "L_chacha20_arm32_64_done_%=: \n\t" + : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes) + : [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word) + : "memory", "cc", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q14", "r12", "r14" + ); +#endif /* __aarch64__ */ +} + +/** + * Encrypt a stream of bytes + */ +static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, + word32 bytes) +{ + int processed; + +#ifdef __aarch64__ + if (bytes >= CHACHA_CHUNK_BYTES * 5) { + processed = (bytes / (CHACHA_CHUNK_BYTES * 5)) * CHACHA_CHUNK_BYTES * 5; + 
wc_Chacha_encrypt_320(ctx->X, m, c, processed); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); + } + if (bytes >= CHACHA_CHUNK_BYTES * 4) { +#else + while (bytes >= CHACHA_CHUNK_BYTES * 4) { +#endif /*__aarch64__ */ + processed = wc_Chacha_encrypt_256(ctx->X, m, c); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); + } + if (bytes >= CHACHA_CHUNK_BYTES * 2) { + processed = wc_Chacha_encrypt_128(ctx->X, m, c); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); + } + if (bytes > 0) { + wc_Chacha_encrypt_64(ctx->X, m, c, bytes); + if (bytes > 64) + ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); + ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); + } +} + +/** + * API to encrypt/decrypt a message of any size. 
+ */ +int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, + word32 msglen) +{ + if (ctx == NULL || output == NULL || input == NULL) + return BAD_FUNC_ARG; + + wc_Chacha_encrypt_bytes(ctx, input, output, msglen); + + return 0; +} + +#endif /* HAVE_CHACHA*/ + +#endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 30ff0813a..95f5da049 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -4363,14 +4363,138 @@ int chacha_test(void) const byte* keys[] = {key1, key2, key3, key4}; - static const byte ivs1[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; - static const byte ivs2[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; - static const byte ivs3[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01}; - static const byte ivs4[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; + static const byte ivs1[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; + static const byte ivs2[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; + static const byte ivs3[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00}; + static const byte ivs4[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; const byte* ivs[] = {ivs1, ivs2, ivs3, ivs4}; +#ifndef BENCH_EMBEDDED + static const byte cipher_big_result[] = { + 0x06, 0xa6, 0x5d, 0x31, 0x21, 0x6c, 0xdb, 0x37, 0x48, 0x7c, 0x01, 0x9d, + 0x72, 0xdf, 0x0a, 0x5b, 0x64, 0x74, 0x20, 0xba, 0x9e, 0xe0, 0x26, 0x7a, + 0xbf, 0xdf, 0x83, 0x34, 0x3b, 0x4f, 0x94, 0x3f, 0x37, 0x89, 0xaf, 0x00, + 0xdf, 0x0f, 0x2e, 0x75, 0x16, 0x41, 0xf6, 0x7a, 0x86, 0x94, 0x9d, 0x32, + 0x56, 0xf0, 0x79, 0x71, 0x68, 0x6f, 0xa6, 0x6b, 0xc6, 0x59, 0x49, 0xf6, + 0x10, 0x34, 0x03, 0x03, 0x16, 0x53, 0x9a, 0x98, 0x2a, 0x46, 0xde, 0x17, + 0x06, 0x65, 0x70, 0xca, 0x0a, 0x1f, 0xab, 0x80, 0x26, 0x96, 0x3f, 0x3e, + 0x7a, 0x3c, 0xa8, 0x87, 0xbb, 0x65, 0xdd, 0x5e, 0x07, 0x7b, 0x34, 0xe0, + 0x56, 0xda, 0x32, 0x13, 0x30, 0xc9, 0x0c, 0xd7, 0xba, 0xe4, 0x1f, 0xa6, + 0x91, 0x4f, 
0x72, 0x9f, 0xd9, 0x5c, 0x62, 0x7d, 0xa6, 0xc2, 0xbc, 0x87, + 0xae, 0x64, 0x11, 0x94, 0x3b, 0xbc, 0x6c, 0x23, 0xbd, 0x7d, 0x00, 0xb4, + 0x99, 0xf2, 0x68, 0xb5, 0x59, 0x70, 0x93, 0xad, 0x69, 0xd0, 0xb1, 0x28, + 0x70, 0x92, 0xeb, 0xec, 0x39, 0x80, 0x82, 0xde, 0x44, 0xe2, 0x8a, 0x26, + 0xb3, 0xe9, 0x45, 0xcf, 0x83, 0x76, 0x9f, 0x6a, 0xa0, 0x46, 0x4a, 0x3d, + 0x26, 0x56, 0xaf, 0x49, 0x41, 0x26, 0x1b, 0x6a, 0x41, 0x37, 0x65, 0x91, + 0x72, 0xc4, 0xe7, 0x3c, 0x17, 0x31, 0xae, 0x2e, 0x2b, 0x31, 0x45, 0xe4, + 0x93, 0xd3, 0x10, 0xaa, 0xc5, 0x62, 0xd5, 0x11, 0x4b, 0x57, 0x1d, 0xad, + 0x48, 0x06, 0xd0, 0x0d, 0x98, 0xa5, 0xc6, 0x5b, 0xd0, 0x9e, 0x22, 0xc0, + 0x00, 0x32, 0x5a, 0xf5, 0x1c, 0x89, 0x6d, 0x54, 0x97, 0x55, 0x6b, 0x46, + 0xc5, 0xc7, 0xc4, 0x48, 0x9c, 0xbf, 0x47, 0xdc, 0x03, 0xc4, 0x1b, 0xcb, + 0x65, 0xa6, 0x91, 0x9d, 0x6d, 0xf1, 0xb0, 0x7a, 0x4d, 0x3b, 0x03, 0x95, + 0xf4, 0x8b, 0x0b, 0xae, 0x39, 0xff, 0x3f, 0xf6, 0xc0, 0x14, 0x18, 0x8a, + 0xe5, 0x19, 0xbd, 0xc1, 0xb4, 0x05, 0x4e, 0x29, 0x2f, 0x0b, 0x33, 0x76, + 0x28, 0x16, 0xa4, 0xa6, 0x93, 0x04, 0xb5, 0x55, 0x6b, 0x89, 0x3d, 0xa5, + 0x0f, 0xd3, 0xad, 0xfa, 0xd9, 0xfd, 0x05, 0x5d, 0x48, 0x94, 0x25, 0x5a, + 0x2c, 0x9a, 0x94, 0x80, 0xb0, 0xe7, 0xcb, 0x4d, 0x77, 0xbf, 0xca, 0xd8, + 0x55, 0x48, 0xbd, 0x66, 0xb1, 0x85, 0x81, 0xb1, 0x37, 0x79, 0xab, 0x52, + 0x08, 0x14, 0x12, 0xac, 0xcd, 0x45, 0x4d, 0x53, 0x6b, 0xca, 0x96, 0xc7, + 0x3b, 0x2f, 0x73, 0xb1, 0x5a, 0x23, 0xbd, 0x65, 0xd5, 0xea, 0x17, 0xb3, + 0xdc, 0xa1, 0x17, 0x1b, 0x2d, 0xb3, 0x9c, 0xd0, 0xdb, 0x41, 0x77, 0xef, + 0x93, 0x20, 0x52, 0x3e, 0x9d, 0xf5, 0xbf, 0x33, 0xf7, 0x52, 0xc1, 0x90, + 0xa0, 0x15, 0x17, 0xce, 0xf7, 0xf7, 0xd0, 0x3a, 0x3b, 0xd1, 0x72, 0x56, + 0x31, 0x81, 0xae, 0x60, 0xab, 0x40, 0xc1, 0xd1, 0x28, 0x77, 0x53, 0xac, + 0x9f, 0x11, 0x0a, 0x88, 0x36, 0x4b, 0xda, 0x57, 0xa7, 0x28, 0x5c, 0x85, + 0xd3, 0x85, 0x9b, 0x79, 0xad, 0x05, 0x1c, 0x37, 0x14, 0x5e, 0x0d, 0xd0, + 0x23, 0x03, 0x42, 0x1d, 0x48, 0x5d, 0xc5, 0x3c, 0x5a, 0x08, 0xa9, 0x0d, + 0x6e, 0x82, 
0x7c, 0x2e, 0x3c, 0x41, 0xcc, 0x96, 0x8e, 0xad, 0xee, 0x2a, + 0x61, 0x0b, 0x16, 0x0f, 0xa9, 0x24, 0x40, 0x85, 0xbc, 0x9f, 0x28, 0x8d, + 0xe6, 0x68, 0x4d, 0x8f, 0x30, 0x48, 0xd9, 0x73, 0x73, 0x6c, 0x9a, 0x7f, + 0x67, 0xf7, 0xde, 0x4c, 0x0a, 0x8b, 0xe4, 0xb3, 0x08, 0x2a, 0x52, 0xda, + 0x54, 0xee, 0xcd, 0xb5, 0x62, 0x4a, 0x26, 0x20, 0xfb, 0x40, 0xbb, 0x39, + 0x3a, 0x0f, 0x09, 0xe8, 0x00, 0xd1, 0x24, 0x97, 0x60, 0xe9, 0x83, 0x83, + 0xfe, 0x9f, 0x9c, 0x15, 0xcf, 0x69, 0x03, 0x9f, 0x03, 0xe1, 0xe8, 0x6e, + 0xbd, 0x87, 0x58, 0x68, 0xee, 0xec, 0xd8, 0x29, 0x46, 0x23, 0x49, 0x92, + 0x72, 0x95, 0x5b, 0x49, 0xca, 0xe0, 0x45, 0x59, 0xb2, 0xca, 0xf4, 0xfc, + 0xb7, 0x59, 0x37, 0x49, 0x28, 0xbc, 0xf3, 0xd7, 0x61, 0xbc, 0x4b, 0xf3, + 0xa9, 0x4b, 0x2f, 0x05, 0xa8, 0x01, 0xa5, 0xdc, 0x00, 0x6e, 0x01, 0xb6, + 0x45, 0x3c, 0xd5, 0x49, 0x7d, 0x5c, 0x25, 0xe8, 0x31, 0x87, 0xb2, 0xb9, + 0xbf, 0xb3, 0x01, 0x62, 0x0c, 0xd0, 0x48, 0x77, 0xa2, 0x34, 0x0f, 0x16, + 0x22, 0x28, 0xee, 0x54, 0x08, 0x93, 0x3b, 0xe4, 0xde, 0x7e, 0x63, 0xf7, + 0x97, 0x16, 0x5d, 0x71, 0x58, 0xc2, 0x2e, 0xf2, 0x36, 0xa6, 0x12, 0x65, + 0x94, 0x17, 0xac, 0x66, 0x23, 0x7e, 0xc6, 0x72, 0x79, 0x24, 0xce, 0x8f, + 0x55, 0x19, 0x97, 0x44, 0xfc, 0x55, 0xec, 0x85, 0x26, 0x27, 0xdb, 0x38, + 0xb1, 0x42, 0x0a, 0xdd, 0x05, 0x99, 0x28, 0xeb, 0x03, 0x6c, 0x9a, 0xe9, + 0x17, 0xf6, 0x2c, 0xb0, 0xfe, 0xe7, 0xa4, 0xa7, 0x31, 0xda, 0x4d, 0xb0, + 0x29, 0xdb, 0xdd, 0x8d, 0x12, 0x13, 0x9c, 0xb4, 0xcc, 0x83, 0x97, 0xfb, + 0x1a, 0xdc, 0x08, 0xd6, 0x30, 0x62, 0xe8, 0xeb, 0x8b, 0x61, 0xcb, 0x1d, + 0x06, 0xe3, 0xa5, 0x4d, 0x35, 0xdb, 0x59, 0xa8, 0x2d, 0x87, 0x27, 0x44, + 0x6f, 0xc0, 0x38, 0x97, 0xe4, 0x85, 0x00, 0x02, 0x09, 0xf6, 0x69, 0x3a, + 0xcf, 0x08, 0x1b, 0x21, 0xbb, 0x79, 0xb1, 0xa1, 0x34, 0x09, 0xe0, 0x80, + 0xca, 0xb0, 0x78, 0x8a, 0x11, 0x97, 0xd4, 0x07, 0xbe, 0x1b, 0x6a, 0x5d, + 0xdb, 0xd6, 0x1f, 0x76, 0x6b, 0x16, 0xf0, 0x58, 0x84, 0x5f, 0x59, 0xce, + 0x62, 0x34, 0xc3, 0xdf, 0x94, 0xb8, 0x2f, 0x84, 0x68, 0xf0, 0xb8, 0x51, + 0xd9, 0x6d, 
0x8e, 0x4a, 0x1d, 0xe6, 0x5c, 0xd8, 0x86, 0x25, 0xe3, 0x24, + 0xfd, 0x21, 0x61, 0x13, 0x48, 0x3e, 0xf6, 0x7d, 0xa6, 0x71, 0x9b, 0xd2, + 0x6e, 0xe6, 0xd2, 0x08, 0x94, 0x62, 0x6c, 0x98, 0xfe, 0x2f, 0x9c, 0x88, + 0x7e, 0x78, 0x15, 0x02, 0x00, 0xf0, 0xba, 0x24, 0x91, 0xf2, 0xdc, 0x47, + 0x51, 0x4d, 0x15, 0x5e, 0x91, 0x5f, 0x57, 0x5b, 0x1d, 0x35, 0x24, 0x45, + 0x75, 0x9b, 0x88, 0x75, 0xf1, 0x2f, 0x85, 0xe7, 0x89, 0xd1, 0x01, 0xb4, + 0xc8, 0x18, 0xb7, 0x97, 0xef, 0x4b, 0x90, 0xf4, 0xbf, 0x10, 0x27, 0x3c, + 0x60, 0xff, 0xc4, 0x94, 0x20, 0x2f, 0x93, 0x4b, 0x4d, 0xe3, 0x80, 0xf7, + 0x2c, 0x71, 0xd9, 0xe3, 0x68, 0xb4, 0x77, 0x2b, 0xc7, 0x0d, 0x39, 0x92, + 0xef, 0x91, 0x0d, 0xb2, 0x11, 0x50, 0x0e, 0xe8, 0xad, 0x3b, 0xf6, 0xb5, + 0xc6, 0x14, 0x4d, 0x33, 0x53, 0xa7, 0x60, 0x15, 0xc7, 0x27, 0x51, 0xdc, + 0x54, 0x29, 0xa7, 0x0d, 0x6a, 0x7b, 0x72, 0x13, 0xad, 0x7d, 0x41, 0x19, + 0x4e, 0x42, 0x49, 0xcc, 0x42, 0xe4, 0xbd, 0x99, 0x13, 0xd9, 0x7f, 0xf3, + 0x38, 0xa4, 0xb6, 0x33, 0xed, 0x07, 0x48, 0x7e, 0x8e, 0x82, 0xfe, 0x3a, + 0x9d, 0x75, 0x93, 0xba, 0x25, 0x4e, 0x37, 0x3c, 0x0c, 0xd5, 0x69, 0xa9, + 0x2d, 0x9e, 0xfd, 0xe8, 0xbb, 0xf5, 0x0c, 0xe2, 0x86, 0xb9, 0x5e, 0x6f, + 0x28, 0xe4, 0x19, 0xb3, 0x0b, 0xa4, 0x86, 0xd7, 0x24, 0xd0, 0xb8, 0x89, + 0x7b, 0x76, 0xec, 0x05, 0x10, 0x5b, 0x68, 0xe9, 0x58, 0x66, 0xa3, 0xc5, + 0xb6, 0x63, 0x20, 0x0e, 0x0e, 0xea, 0x3d, 0x61, 0x5e, 0xda, 0x3d, 0x3c, + 0xf9, 0xfd, 0xed, 0xa9, 0xdb, 0x52, 0x94, 0x8a, 0x00, 0xca, 0x3c, 0x8d, + 0x66, 0x8f, 0xb0, 0xf0, 0x5a, 0xca, 0x3f, 0x63, 0x71, 0xbf, 0xca, 0x99, + 0x37, 0x9b, 0x75, 0x97, 0x89, 0x10, 0x6e, 0xcf, 0xf2, 0xf5, 0xe3, 0xd5, + 0x45, 0x9b, 0xad, 0x10, 0x71, 0x6c, 0x5f, 0x6f, 0x7f, 0x22, 0x77, 0x18, + 0x2f, 0xf9, 0x99, 0xc5, 0x69, 0x58, 0x03, 0x12, 0x86, 0x82, 0x3e, 0xbf, + 0xc2, 0x12, 0x35, 0x43, 0xa3, 0xd9, 0x18, 0x4f, 0x41, 0x11, 0x6b, 0xf3, + 0x67, 0xaf, 0x3d, 0x78, 0xe4, 0x22, 0x2d, 0xb3, 0x48, 0x43, 0x31, 0x1d, + 0xef, 0xa8, 0xba, 0x49, 0x8e, 0xa9, 0xa7, 0xb6, 0x18, 0x77, 0x84, 0xca, + 0xbd, 0xa2, 
0x02, 0x1b, 0x6a, 0xf8, 0x5f, 0xda, 0xff, 0xcf, 0x01, 0x6a, + 0x86, 0x69, 0xa9, 0xe9, 0xcb, 0x60, 0x1e, 0x15, 0xdc, 0x8f, 0x5d, 0x39, + 0xb5, 0xce, 0x55, 0x5f, 0x47, 0x97, 0xb1, 0x19, 0x6e, 0x21, 0xd6, 0x13, + 0x39, 0xb2, 0x24, 0xe0, 0x62, 0x82, 0x9f, 0xed, 0x12, 0x81, 0xed, 0xee, + 0xab, 0xd0, 0x2f, 0x19, 0x89, 0x3f, 0x57, 0x2e, 0xc2, 0xe2, 0x67, 0xe8, + 0xae, 0x03, 0x56, 0xba, 0xd4, 0xd0, 0xa4, 0x89, 0x03, 0x06, 0x5b, 0xcc, + 0xf2, 0x22, 0xb8, 0x0e, 0x76, 0x79, 0x4a, 0x42, 0x1d, 0x37, 0x51, 0x5a, + 0xaa, 0x46, 0x6c, 0x2a, 0xdd, 0x66, 0xfe, 0xc6, 0x68, 0xc3, 0x38, 0xa2, + 0xae, 0x5b, 0x98, 0x24, 0x5d, 0x43, 0x05, 0x82, 0x38, 0x12, 0xd3, 0xd1, + 0x75, 0x2d, 0x4f, 0x61, 0xbd, 0xb9, 0x10, 0x87, 0x44, 0x2a, 0x78, 0x07, + 0xff, 0xf4, 0x0f, 0xa1, 0xf3, 0x68, 0x9f, 0xbe, 0xae, 0xa2, 0x91, 0xf0, + 0xc7, 0x55, 0x7a, 0x52, 0xd5, 0xa3, 0x8d, 0x6f, 0xe4, 0x90, 0x5c, 0xf3, + 0x5f, 0xce, 0x3d, 0x23, 0xf9, 0x8e, 0xae, 0x14, 0xfb, 0x82, 0x9a, 0xa3, + 0x04, 0x5f, 0xbf, 0xad, 0x3e, 0xf2, 0x97, 0x0a, 0x60, 0x40, 0x70, 0x19, + 0x72, 0xad, 0x66, 0xfb, 0x78, 0x1b, 0x84, 0x6c, 0x98, 0xbc, 0x8c, 0xf8, + 0x4f, 0xcb, 0xb5, 0xf6, 0xaf, 0x7a, 0xb7, 0x93, 0xef, 0x67, 0x48, 0x02, + 0x2c, 0xcb, 0xe6, 0x77, 0x0f, 0x7b, 0xc1, 0xee, 0xc5, 0xb6, 0x2d, 0x7e, + 0x62, 0xa0, 0xc0, 0xa7, 0xa5, 0x80, 0x31, 0x92, 0x50, 0xa1, 0x28, 0x22, + 0x95, 0x03, 0x17, 0xd1, 0x0f, 0xf6, 0x08, 0xe5, 0xec + }; +#define CHACHA_BIG_TEST_SIZE 1305 +#ifndef WOLFSSL_SMALL_STACK + byte cipher_big[CHACHA_BIG_TEST_SIZE] = {0}; + byte plain_big[CHACHA_BIG_TEST_SIZE] = {0}; + byte input_big[CHACHA_BIG_TEST_SIZE] = {0}; +#else + byte* cipher_big; + byte* plain_big; + byte* input_big; +#endif /* WOLFSSL_SMALL_STACK */ + int block_size; +#endif /* BENCH_EMBEDDED */ byte a[] = {0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90}; byte b[] = {0x45,0x40,0xf0,0x5a,0x9f,0x1f,0xb2,0x96}; @@ -4384,6 +4508,26 @@ int chacha_test(void) test_chacha[2] = c; test_chacha[3] = d; +#ifndef BENCH_EMBEDDED +#ifdef WOLFSSL_SMALL_STACK + cipher_big = 
(byte*)XMALLOC(CHACHA_BIG_TEST_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (cipher_big == NULL) { + return MEMORY_E; + } + plain_big = (byte*)XMALLOC(CHACHA_BIG_TEST_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (plain_big == NULL) { + return MEMORY_E; + } + input_big = (byte*)XMALLOC(CHACHA_BIG_TEST_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (input_big == NULL) { + return MEMORY_E; + } + XMEMSET(cipher_big, 0, sizeof(CHACHA_BIG_TEST_SIZE)); + XMEMSET(plain_big, 0, sizeof(CHACHA_BIG_TEST_SIZE)); + XMEMSET(input_big, 0, sizeof(CHACHA_BIG_TEST_SIZE)); +#endif /* WOLFSSL_SMALL_STACK */ +#endif /* BENCH_EMBEDDED */ + for (i = 0; i < times; ++i) { if (i < 3) { keySz = 32; @@ -4444,6 +4588,66 @@ int chacha_test(void) if (XMEMCMP(plain + 64, sliver, 64)) return -4320; +#ifndef BENCH_EMBEDDED + /* test of encrypting more data */ + keySz = 32; + + ret |= wc_Chacha_SetKey(&enc, keys[0], keySz); + ret |= wc_Chacha_SetKey(&dec, keys[0], keySz); + if (ret != 0) + return ret; + + ret |= wc_Chacha_SetIV(&enc, ivs[2], 0); + ret |= wc_Chacha_SetIV(&dec, ivs[2], 0); + if (ret != 0) + return ret; + + ret |= wc_Chacha_Process(&enc, cipher_big, plain_big, CHACHA_BIG_TEST_SIZE); + ret |= wc_Chacha_Process(&dec, plain_big, cipher_big, CHACHA_BIG_TEST_SIZE); + if (ret != 0) + return ret; + + if (XMEMCMP(plain_big, input_big, sizeof(input_big))) + return -4330; + + if (XMEMCMP(cipher_big, cipher_big_result, CHACHA_BIG_TEST_SIZE)) + return -4331; + + for (i = 0; i < 18; ++i) { + /* this will test all paths */ + // block sizes: 1 2 3 4 7 8 15 16 31 32 63 64 127 128 255 256 511 512 + block_size = (2 << (i%9)) - (i<9?1:0); + keySz = 32; + + ret |= wc_Chacha_SetKey(&enc, keys[0], keySz); + ret |= wc_Chacha_SetKey(&dec, keys[0], keySz); + if (ret != 0) + return ret; + + ret |= wc_Chacha_SetIV(&enc, ivs[2], 0); + ret |= wc_Chacha_SetIV(&dec, ivs[2], 0); + if (ret != 0) + return ret; + + ret |= wc_Chacha_Process(&enc, cipher_big, plain_big, block_size); + ret |= wc_Chacha_Process(&dec, plain_big, 
cipher_big, block_size); + if (ret != 0) + return ret; + + if (XMEMCMP(plain_big, input_big, block_size)) + return -4340-i; + + if (XMEMCMP(cipher_big, cipher_big_result, block_size)) + return -4360-i; + } + +#ifdef WOLFSSL_SMALL_STACK + XFREE(cipher_big, NULL, DYNAMIC_TYPE_TMP_BUFFER); + XFREE(plain_big, NULL, DYNAMIC_TYPE_TMP_BUFFER); + XFREE(input_big, NULL, DYNAMIC_TYPE_TMP_BUFFER); +#endif /* WOLFSSL_SMALL_STACK */ +#endif /* BENCH_EMBEDDED */ + return 0; } #endif /* HAVE_CHACHA */ From 540c5cdd2fb0c112d1be17efbe79d9f9006273c0 Mon Sep 17 00:00:00 2001 From: David Garske Date: Mon, 10 Jun 2019 20:33:28 +0200 Subject: [PATCH 19/21] Updated benchmarks with SPI "dual mode" I/O enabled. --- IDE/ECLIPSE/SIFIVE/README.md | 62 ++++++++++++++++++------------------ IDE/ECLIPSE/SIFIVE/main.c | 5 +++ 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/IDE/ECLIPSE/SIFIVE/README.md b/IDE/ECLIPSE/SIFIVE/README.md index 5ce2271bc..3e7e39303 100644 --- a/IDE/ECLIPSE/SIFIVE/README.md +++ b/IDE/ECLIPSE/SIFIVE/README.md @@ -140,37 +140,37 @@ Actual Clock 320MHz wolfSSL version 4.0.0 ------------------------------------------------------------------------------ wolfCrypt Benchmark (block bytes 1024, min 1.0 sec each) -RNG 200 KB took 1.044 seconds, 191.519 KB/s -AES-128-CBC-enc 50 KB took 1.657 seconds, 30.174 KB/s -AES-128-CBC-dec 50 KB took 1.659 seconds, 30.141 KB/s -AES-192-CBC-enc 50 KB took 1.837 seconds, 27.220 KB/s -AES-192-CBC-dec 50 KB took 1.839 seconds, 27.194 KB/s -AES-256-CBC-enc 25 KB took 1.009 seconds, 24.784 KB/s -AES-256-CBC-dec 25 KB took 1.010 seconds, 24.761 KB/s -AES-128-GCM-enc 25 KB took 1.493 seconds, 16.739 KB/s -AES-128-GCM-dec 25 KB took 1.564 seconds, 15.986 KB/s -AES-192-GCM-enc 25 KB took 1.591 seconds, 15.716 KB/s -AES-192-GCM-dec 25 KB took 1.662 seconds, 15.044 KB/s -AES-256-GCM-enc 25 KB took 1.684 seconds, 14.843 KB/s -AES-256-GCM-dec 25 KB took 1.755 seconds, 14.245 KB/s -CHACHA 1 MB took 1.004 seconds, 0.997 MB/s -CHA-POLY 675 KB 
took 1.021 seconds, 661.060 KB/s -POLY1305 2 MB took 1.007 seconds, 2.230 MB/s -SHA 1 MB took 1.016 seconds, 1.321 MB/s -SHA-256 425 KB took 1.005 seconds, 422.909 KB/s -SHA-512 225 KB took 1.009 seconds, 223.073 KB/s -HMAC-SHA 1 MB took 1.010 seconds, 1.378 MB/s -HMAC-SHA256 425 KB took 1.037 seconds, 409.781 KB/s -HMAC-SHA512 25 KB took 2.075 seconds, 12.050 KB/s -ECC 256 key gen 2 ops took 1.099 sec, avg 549.271 ms, 1.821 ops/sec -ECDHE 256 agree 2 ops took 1.093 sec, avg 546.555 ms, 1.830 ops/sec -ECDSA 256 sign 2 ops took 1.167 sec, avg 583.694 ms, 1.713 ops/sec -ECDSA 256 verify 2 ops took 2.136 sec, avg 1067.795 ms, 0.937 ops/sec -CURVE 25519 key gen 2 ops took 1.693 sec, avg 846.451 ms, 1.181 ops/sec -CURVE 25519 agree 2 ops took 1.689 sec, avg 844.299 ms, 1.184 ops/sec -ED 25519 key gen 1 ops took 1.702 sec, avg 1702.057 ms, 0.588 ops/sec -ED 25519 sign 2 ops took 3.650 sec, avg 1824.753 ms, 0.548 ops/sec -ED 25519 verify 2 ops took 5.788 sec, avg 2894.012 ms, 0.346 ops/sec +RNG 250 KB took 1.098 seconds, 227.714 KB/s +AES-128-CBC-enc 50 KB took 1.132 seconds, 44.175 KB/s +AES-128-CBC-dec 50 KB took 1.142 seconds, 43.778 KB/s +AES-192-CBC-enc 50 KB took 1.250 seconds, 40.007 KB/s +AES-192-CBC-dec 50 KB took 1.260 seconds, 39.677 KB/s +AES-256-CBC-enc 50 KB took 1.368 seconds, 36.552 KB/s +AES-256-CBC-dec 50 KB took 1.378 seconds, 36.279 KB/s +AES-128-GCM-enc 25 KB took 1.225 seconds, 20.412 KB/s +AES-128-GCM-dec 25 KB took 1.225 seconds, 20.402 KB/s +AES-192-GCM-enc 25 KB took 1.290 seconds, 19.373 KB/s +AES-192-GCM-dec 25 KB took 1.291 seconds, 19.366 KB/s +AES-256-GCM-enc 25 KB took 1.352 seconds, 18.487 KB/s +AES-256-GCM-dec 25 KB took 1.353 seconds, 18.478 KB/s +CHACHA 1 MB took 1.006 seconds, 1.020 MB/s +CHA-POLY 700 KB took 1.032 seconds, 678.045 KB/s +POLY1305 2 MB took 1.007 seconds, 2.255 MB/s +SHA 2 MB took 1.002 seconds, 1.511 MB/s +SHA-256 525 KB took 1.011 seconds, 519.279 KB/s +SHA-512 275 KB took 1.017 seconds, 270.477 KB/s +HMAC-SHA 1 MB 
took 1.013 seconds, 1.399 MB/s +HMAC-SHA256 525 KB took 1.019 seconds, 515.020 KB/s +HMAC-SHA512 275 KB took 1.032 seconds, 266.351 KB/s +ECC 256 key gen 2 ops took 1.104 sec, avg 551.834 ms, 1.812 ops/sec +ECDHE 256 agree 2 ops took 1.101 sec, avg 550.400 ms, 1.817 ops/sec +ECDSA 256 sign 2 ops took 1.173 sec, avg 586.502 ms, 1.705 ops/sec +ECDSA 256 verify 2 ops took 2.153 sec, avg 1076.294 ms, 0.929 ops/sec +CURVE 25519 key gen 2 ops took 1.629 sec, avg 814.423 ms, 1.228 ops/sec +CURVE 25519 agree 2 ops took 1.626 sec, avg 813.156 ms, 1.230 ops/sec +ED 25519 key gen 1 ops took 1.436 sec, avg 1436.096 ms, 0.696 ops/sec +ED 25519 sign 2 ops took 2.913 sec, avg 1456.421 ms, 0.687 ops/sec +ED 25519 verify 2 ops took 5.012 sec, avg 2506.012 ms, 0.399 ops/sec Benchmark complete ``` diff --git a/IDE/ECLIPSE/SIFIVE/main.c b/IDE/ECLIPSE/SIFIVE/main.c index 50c398ae5..dc33ac163 100644 --- a/IDE/ECLIPSE/SIFIVE/main.c +++ b/IDE/ECLIPSE/SIFIVE/main.c @@ -150,6 +150,11 @@ int main(void) #endif printf("Actual Clock %dMHz\n", clk_Hz/1000000); + /* Reconfigure the SPI Bus for dual mode */ + #define QSPI0_CTRL 0x10014000UL + #define FESPI_REG_FFMT (*((volatile uint32_t *)(QSPI0_CTRL + 0x64))) + FESPI_REG_FFMT = 0xbb1447; + #ifdef DEBUG_WOLFSSL wolfSSL_Debugging_ON(); #endif From 48d4ed38ac995a6f522d0bde9af87c54c309f499 Mon Sep 17 00:00:00 2001 From: Tesfa Mael Date: Mon, 10 Jun 2019 14:06:42 -0700 Subject: [PATCH 20/21] Include SIFIVE files for distribution --- IDE/include.am | 1 + 1 file changed, 1 insertion(+) diff --git a/IDE/include.am b/IDE/include.am index 205ee6a35..a70a88fef 100644 --- a/IDE/include.am +++ b/IDE/include.am @@ -18,6 +18,7 @@ include IDE/GCC-ARM/include.am include IDE/CSBENCH/include.am include IDE/ECLIPSE/DEOS/include.am include IDE/ECLIPSE/MICRIUM/include.am +include IDE/ECLIPSE/SIFIVE/include.am include IDE/mynewt/include.am include IDE/Renesas/cs+/Projects/include.am include IDE/Renesas/e2studio/Projects/include.am From 
911c8df185957698c1cf0a75c975fee0f439e36c Mon Sep 17 00:00:00 2001 From: Juliusz Sosinowicz Date: Wed, 12 Jun 2019 00:52:42 +0200 Subject: [PATCH 21/21] Fix 256 byte assembly slowdown --- wolfcrypt/src/port/arm/armv8-chacha.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-chacha.c b/wolfcrypt/src/port/arm/armv8-chacha.c index 8eebc0334..76487d683 100644 --- a/wolfcrypt/src/port/arm/armv8-chacha.c +++ b/wolfcrypt/src/port/arm/armv8-chacha.c @@ -974,11 +974,10 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS // The paper NEON crypto by Daniel J. Bernstein and Peter Schwabe was used to optimize for ARM // https://cryptojedi.org/papers/neoncrypto-20120320.pdf + ".align 2 \n\t" "LDR r14, %[input] \n\t" // load input address - "MOV r11, #1 \n\t" "LDM r14, { r0-r12 } \n\t" - "STRD r10, r11, %[x_10] \n\t" // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 // 0 1 2 3 4 5 6 7 8 9 10 11 12 "VMOV d0, r0, r1 \n\t" @@ -986,14 +985,15 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS "VMOV d2, r4, r5 \n\t" "VMOV d3, r6, r7 \n\t" "VMOV d4, r8, r9 \n\t" + "STRD r10, r11, %[x_10] \n\t" "VMOV d5, r10, r11 \n\t" + "LDRD r11, r10, [r14, #4*14] \n\t" "VMOV q4, q0 \n\t" "VMOV q5, q1 \n\t" "VMOV q6, q2 \n\t" "VMOV q8, q0 \n\t" "VMOV q9, q1 \n\t" "VMOV q10, q2 \n\t" - "LDRD r11, r10, [r14, #4*14] \n\t" // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 // 0 1 2 3 4 5 6 7 8 9 15 14 12 "VMOV d7, r11, r10 \n\t"