mirror of
https://github.com/wolfSSL/wolfssl.git
synced 2026-07-05 13:20:52 +02:00
Merge pull request #10744 from SparkiDev/arm64_asm_opt
ARM64 ASM: optimizations
This commit is contained in:
@@ -44518,37 +44518,32 @@ L_AES_CTR_encrypt_NEON_loop_4:
|
||||
ld1 {v4.2d}, [x9], #16
|
||||
mov v8.d[1], x10
|
||||
mov v8.d[0], x11
|
||||
rev64 v8.16b, v8.16b
|
||||
rev32 v8.16b, v8.16b
|
||||
rev64 v8.4s, v8.4s
|
||||
# Round: 0 - XOR in key schedule
|
||||
eor v0.16b, v8.16b, v4.16b
|
||||
adds x10, x10, #1
|
||||
adc x11, x11, xzr
|
||||
mov v8.d[1], x10
|
||||
mov v8.d[0], x11
|
||||
rev64 v8.16b, v8.16b
|
||||
rev32 v8.16b, v8.16b
|
||||
rev64 v8.4s, v8.4s
|
||||
eor v1.16b, v8.16b, v4.16b
|
||||
adds x10, x10, #1
|
||||
adc x11, x11, xzr
|
||||
mov v8.d[1], x10
|
||||
mov v8.d[0], x11
|
||||
rev64 v8.16b, v8.16b
|
||||
rev32 v8.16b, v8.16b
|
||||
rev64 v8.4s, v8.4s
|
||||
eor v2.16b, v8.16b, v4.16b
|
||||
adds x10, x10, #1
|
||||
adc x11, x11, xzr
|
||||
mov v8.d[1], x10
|
||||
mov v8.d[0], x11
|
||||
rev64 v8.16b, v8.16b
|
||||
rev32 v8.16b, v8.16b
|
||||
rev64 v8.4s, v8.4s
|
||||
eor v3.16b, v8.16b, v4.16b
|
||||
adds x10, x10, #1
|
||||
adc x11, x11, xzr
|
||||
mov v8.d[1], x10
|
||||
mov v8.d[0], x11
|
||||
rev64 v8.16b, v8.16b
|
||||
rev32 v8.16b, v8.16b
|
||||
rev64 v8.4s, v8.4s
|
||||
sub w8, w4, #2
|
||||
L_AES_CTR_encrypt_NEON_loop_nr_4:
|
||||
tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b
|
||||
@@ -44947,8 +44942,7 @@ L_AES_CTR_encrypt_NEON_loop_nr_4:
|
||||
bge L_AES_CTR_encrypt_NEON_loop_4
|
||||
mov v2.d[1], x10
|
||||
mov v2.d[0], x11
|
||||
rev64 v2.16b, v2.16b
|
||||
rev32 v2.16b, v2.16b
|
||||
rev64 v2.4s, v2.4s
|
||||
L_AES_CTR_encrypt_NEON_start_2:
|
||||
movi v12.16b, #0x40
|
||||
movi v13.16b, #0x80
|
||||
@@ -44966,15 +44960,13 @@ L_AES_CTR_encrypt_NEON_loop_2:
|
||||
adc x11, x11, xzr
|
||||
mov v2.d[1], x10
|
||||
mov v2.d[0], x11
|
||||
rev64 v2.16b, v2.16b
|
||||
rev32 v2.16b, v2.16b
|
||||
rev64 v2.4s, v2.4s
|
||||
eor v1.16b, v2.16b, v4.16b
|
||||
adds x10, x10, #1
|
||||
adc x11, x11, xzr
|
||||
mov v2.d[1], x10
|
||||
mov v2.d[0], x11
|
||||
rev64 v2.16b, v2.16b
|
||||
rev32 v2.16b, v2.16b
|
||||
rev64 v2.4s, v2.4s
|
||||
sub w8, w4, #2
|
||||
L_AES_CTR_encrypt_NEON_loop_nr_2:
|
||||
eor v8.16b, v0.16b, v12.16b
|
||||
@@ -45291,8 +45283,7 @@ L_AES_CTR_encrypt_NEON_loop_nr_1:
|
||||
adc x11, x11, xzr
|
||||
mov v2.d[1], x10
|
||||
mov v2.d[0], x11
|
||||
rev64 v2.16b, v2.16b
|
||||
rev32 v2.16b, v2.16b
|
||||
rev64 v2.4s, v2.4s
|
||||
L_AES_CTR_encrypt_NEON_data_done:
|
||||
rev32 v2.16b, v2.16b
|
||||
st1 {v2.2d}, [x5]
|
||||
@@ -49790,8 +49781,7 @@ _AES_XTS_decrypt_NEON:
|
||||
mov x17, #0x87
|
||||
ands w19, w2, #15
|
||||
cset w16, ne
|
||||
lsl w16, w16, #4
|
||||
sub w2, w2, w16
|
||||
sub w2, w2, w16, lsl 4
|
||||
ld1 {v2.2d}, [x3]
|
||||
ld1 {v4.2d}, [x5]
|
||||
rev32 v2.16b, v2.16b
|
||||
@@ -51689,11 +51679,10 @@ L_AES_set_encrypt_key_loop_256:
|
||||
stp w6, w7, [x2]
|
||||
stnp w8, w9, [x2, #8]
|
||||
sub x2, x2, #16
|
||||
mov w3, w9
|
||||
ubfx w6, w3, #8, #8
|
||||
ubfx w7, w3, #16, #8
|
||||
ubfx w8, w3, #24, #8
|
||||
ubfx w3, w3, #0, #8
|
||||
ubfx w6, w9, #8, #8
|
||||
ubfx w7, w9, #16, #8
|
||||
ubfx w8, w9, #24, #8
|
||||
ubfx w3, w9, #0, #8
|
||||
lsl w6, w6, #2
|
||||
lsl w7, w7, #2
|
||||
lsl w8, w8, #2
|
||||
@@ -55634,8 +55623,7 @@ _AES_XTS_decrypt:
|
||||
#endif /* __APPLE__ */
|
||||
ands w11, w2, #15
|
||||
cset w11, ne
|
||||
lsl w11, w11, #4
|
||||
sub w2, w2, w11
|
||||
sub w2, w2, w11, lsl 4
|
||||
mov x11, #0x87
|
||||
mov x28, x5
|
||||
ldp x23, x24, [x3]
|
||||
|
||||
@@ -44787,37 +44787,32 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
|
||||
"ld1 {v4.2d}, [x9], #16\n\t"
|
||||
"mov v8.d[1], x10\n\t"
|
||||
"mov v8.d[0], x11\n\t"
|
||||
"rev64 v8.16b, v8.16b\n\t"
|
||||
"rev32 v8.16b, v8.16b\n\t"
|
||||
"rev64 v8.4s, v8.4s\n\t"
|
||||
/* Round: 0 - XOR in key schedule */
|
||||
"eor v0.16b, v8.16b, v4.16b\n\t"
|
||||
"adds x10, x10, #1\n\t"
|
||||
"adc x11, x11, xzr\n\t"
|
||||
"mov v8.d[1], x10\n\t"
|
||||
"mov v8.d[0], x11\n\t"
|
||||
"rev64 v8.16b, v8.16b\n\t"
|
||||
"rev32 v8.16b, v8.16b\n\t"
|
||||
"rev64 v8.4s, v8.4s\n\t"
|
||||
"eor v1.16b, v8.16b, v4.16b\n\t"
|
||||
"adds x10, x10, #1\n\t"
|
||||
"adc x11, x11, xzr\n\t"
|
||||
"mov v8.d[1], x10\n\t"
|
||||
"mov v8.d[0], x11\n\t"
|
||||
"rev64 v8.16b, v8.16b\n\t"
|
||||
"rev32 v8.16b, v8.16b\n\t"
|
||||
"rev64 v8.4s, v8.4s\n\t"
|
||||
"eor v2.16b, v8.16b, v4.16b\n\t"
|
||||
"adds x10, x10, #1\n\t"
|
||||
"adc x11, x11, xzr\n\t"
|
||||
"mov v8.d[1], x10\n\t"
|
||||
"mov v8.d[0], x11\n\t"
|
||||
"rev64 v8.16b, v8.16b\n\t"
|
||||
"rev32 v8.16b, v8.16b\n\t"
|
||||
"rev64 v8.4s, v8.4s\n\t"
|
||||
"eor v3.16b, v8.16b, v4.16b\n\t"
|
||||
"adds x10, x10, #1\n\t"
|
||||
"adc x11, x11, xzr\n\t"
|
||||
"mov v8.d[1], x10\n\t"
|
||||
"mov v8.d[0], x11\n\t"
|
||||
"rev64 v8.16b, v8.16b\n\t"
|
||||
"rev32 v8.16b, v8.16b\n\t"
|
||||
"rev64 v8.4s, v8.4s\n\t"
|
||||
"sub w8, %w[nr], #2\n\t"
|
||||
"\n"
|
||||
"L_AES_CTR_encrypt_NEON_loop_nr_4_%=:\n\t"
|
||||
@@ -45217,8 +45212,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
|
||||
"b.ge L_AES_CTR_encrypt_NEON_loop_4_%=\n\t"
|
||||
"mov v2.d[1], x10\n\t"
|
||||
"mov v2.d[0], x11\n\t"
|
||||
"rev64 v2.16b, v2.16b\n\t"
|
||||
"rev32 v2.16b, v2.16b\n\t"
|
||||
"rev64 v2.4s, v2.4s\n\t"
|
||||
"\n"
|
||||
"L_AES_CTR_encrypt_NEON_start_2_%=:\n\t"
|
||||
"movi v12.16b, #0x40\n\t"
|
||||
@@ -45238,15 +45232,13 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
|
||||
"adc x11, x11, xzr\n\t"
|
||||
"mov v2.d[1], x10\n\t"
|
||||
"mov v2.d[0], x11\n\t"
|
||||
"rev64 v2.16b, v2.16b\n\t"
|
||||
"rev32 v2.16b, v2.16b\n\t"
|
||||
"rev64 v2.4s, v2.4s\n\t"
|
||||
"eor v1.16b, v2.16b, v4.16b\n\t"
|
||||
"adds x10, x10, #1\n\t"
|
||||
"adc x11, x11, xzr\n\t"
|
||||
"mov v2.d[1], x10\n\t"
|
||||
"mov v2.d[0], x11\n\t"
|
||||
"rev64 v2.16b, v2.16b\n\t"
|
||||
"rev32 v2.16b, v2.16b\n\t"
|
||||
"rev64 v2.4s, v2.4s\n\t"
|
||||
"sub w8, %w[nr], #2\n\t"
|
||||
"\n"
|
||||
"L_AES_CTR_encrypt_NEON_loop_nr_2_%=:\n\t"
|
||||
@@ -45566,8 +45558,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
|
||||
"adc x11, x11, xzr\n\t"
|
||||
"mov v2.d[1], x10\n\t"
|
||||
"mov v2.d[0], x11\n\t"
|
||||
"rev64 v2.16b, v2.16b\n\t"
|
||||
"rev32 v2.16b, v2.16b\n\t"
|
||||
"rev64 v2.4s, v2.4s\n\t"
|
||||
"\n"
|
||||
"L_AES_CTR_encrypt_NEON_data_done_%=:\n\t"
|
||||
"rev32 v2.16b, v2.16b\n\t"
|
||||
@@ -49928,8 +49919,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
|
||||
"mov x17, #0x87\n\t"
|
||||
"ands w19, %w[sz], #15\n\t"
|
||||
"cset w16, ne\n\t"
|
||||
"lsl w16, w16, #4\n\t"
|
||||
"sub %w[sz], %w[sz], w16\n\t"
|
||||
"sub %w[sz], %w[sz], w16, lsl 4\n\t"
|
||||
"ld1 {v2.2d}, [%x[i]]\n\t"
|
||||
"ld1 {v4.2d}, [%x[key2]]\n\t"
|
||||
"rev32 v2.16b, v2.16b\n\t"
|
||||
@@ -51770,11 +51760,10 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len,
|
||||
"stp w6, w7, [%x[ks]]\n\t"
|
||||
"stnp w8, w9, [%x[ks], #8]\n\t"
|
||||
"sub %x[ks], %x[ks], #16\n\t"
|
||||
"mov w3, w9\n\t"
|
||||
"ubfx w6, w3, #8, #8\n\t"
|
||||
"ubfx w7, w3, #16, #8\n\t"
|
||||
"ubfx w8, w3, #24, #8\n\t"
|
||||
"ubfx w3, w3, #0, #8\n\t"
|
||||
"ubfx w6, w9, #8, #8\n\t"
|
||||
"ubfx w7, w9, #16, #8\n\t"
|
||||
"ubfx w8, w9, #24, #8\n\t"
|
||||
"ubfx w3, w9, #0, #8\n\t"
|
||||
"lsl w6, w6, #2\n\t"
|
||||
"lsl w7, w7, #2\n\t"
|
||||
"lsl w8, w8, #2\n\t"
|
||||
@@ -55552,8 +55541,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
|
||||
__asm__ __volatile__ (
|
||||
"ands w11, %w[sz], #15\n\t"
|
||||
"cset w11, ne\n\t"
|
||||
"lsl w11, w11, #4\n\t"
|
||||
"sub %w[sz], %w[sz], w11\n\t"
|
||||
"sub %w[sz], %w[sz], w11, lsl 4\n\t"
|
||||
"mov x11, #0x87\n\t"
|
||||
"mov x28, %x[key2]\n\t"
|
||||
"ldp x23, x24, [%x[i]]\n\t"
|
||||
|
||||
@@ -1101,8 +1101,7 @@ L_chacha_use_over_arm64_byte_loop:
|
||||
eor w5, w5, w4
|
||||
subs x3, x3, #1
|
||||
strb w5, [x1], #1
|
||||
beq L_chacha_use_over_arm64_done
|
||||
b L_chacha_use_over_arm64_byte_loop
|
||||
bne L_chacha_use_over_arm64_byte_loop
|
||||
L_chacha_use_over_arm64_done:
|
||||
ret
|
||||
#ifndef __APPLE__
|
||||
|
||||
@@ -1024,8 +1024,7 @@ void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len)
|
||||
"eor w5, w5, w4\n\t"
|
||||
"subs %w[len], %w[len], #1\n\t"
|
||||
"strb w5, [%x[output]], #1\n\t"
|
||||
"b.eq L_chacha_use_over_arm64_done_%=\n\t"
|
||||
"b L_chacha_use_over_arm64_byte_loop_%=\n\t"
|
||||
"b.ne L_chacha_use_over_arm64_byte_loop_%=\n\t"
|
||||
"\n"
|
||||
"L_chacha_use_over_arm64_done_%=:\n\t"
|
||||
: [over] "+r" (over), [output] "+r" (output), [len] "+r" (len)
|
||||
|
||||
Reference in New Issue
Block a user