Merge pull request #10744 from SparkiDev/arm64_asm_opt

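ARM64 ASM: optimizations — fold redundant instruction pairs in the AES-CTR, AES-XTS, AES key-schedule and ChaCha assembly (both the .S and C inline-asm versions): `rev64`+`rev32` on bytes becomes a single `rev64` on 32-bit lanes, `lsl`+`sub` becomes `sub` with a shifted operand, `mov`+`ubfx` reads the source register directly, and `beq`+`b` becomes a single `bne`.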
This commit is contained in:
David Garske
2026-06-30 08:42:23 -07:00
committed by GitHub
4 changed files with 32 additions and 58 deletions
+15 -27
View File
@@ -44518,37 +44518,32 @@ L_AES_CTR_encrypt_NEON_loop_4:
ld1 {v4.2d}, [x9], #16
mov v8.d[1], x10
mov v8.d[0], x11
rev64 v8.16b, v8.16b
rev32 v8.16b, v8.16b
rev64 v8.4s, v8.4s
# Round: 0 - XOR in key schedule
eor v0.16b, v8.16b, v4.16b
adds x10, x10, #1
adc x11, x11, xzr
mov v8.d[1], x10
mov v8.d[0], x11
rev64 v8.16b, v8.16b
rev32 v8.16b, v8.16b
rev64 v8.4s, v8.4s
eor v1.16b, v8.16b, v4.16b
adds x10, x10, #1
adc x11, x11, xzr
mov v8.d[1], x10
mov v8.d[0], x11
rev64 v8.16b, v8.16b
rev32 v8.16b, v8.16b
rev64 v8.4s, v8.4s
eor v2.16b, v8.16b, v4.16b
adds x10, x10, #1
adc x11, x11, xzr
mov v8.d[1], x10
mov v8.d[0], x11
rev64 v8.16b, v8.16b
rev32 v8.16b, v8.16b
rev64 v8.4s, v8.4s
eor v3.16b, v8.16b, v4.16b
adds x10, x10, #1
adc x11, x11, xzr
mov v8.d[1], x10
mov v8.d[0], x11
rev64 v8.16b, v8.16b
rev32 v8.16b, v8.16b
rev64 v8.4s, v8.4s
sub w8, w4, #2
L_AES_CTR_encrypt_NEON_loop_nr_4:
tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b
@@ -44947,8 +44942,7 @@ L_AES_CTR_encrypt_NEON_loop_nr_4:
bge L_AES_CTR_encrypt_NEON_loop_4
mov v2.d[1], x10
mov v2.d[0], x11
rev64 v2.16b, v2.16b
rev32 v2.16b, v2.16b
rev64 v2.4s, v2.4s
L_AES_CTR_encrypt_NEON_start_2:
movi v12.16b, #0x40
movi v13.16b, #0x80
@@ -44966,15 +44960,13 @@ L_AES_CTR_encrypt_NEON_loop_2:
adc x11, x11, xzr
mov v2.d[1], x10
mov v2.d[0], x11
rev64 v2.16b, v2.16b
rev32 v2.16b, v2.16b
rev64 v2.4s, v2.4s
eor v1.16b, v2.16b, v4.16b
adds x10, x10, #1
adc x11, x11, xzr
mov v2.d[1], x10
mov v2.d[0], x11
rev64 v2.16b, v2.16b
rev32 v2.16b, v2.16b
rev64 v2.4s, v2.4s
sub w8, w4, #2
L_AES_CTR_encrypt_NEON_loop_nr_2:
eor v8.16b, v0.16b, v12.16b
@@ -45291,8 +45283,7 @@ L_AES_CTR_encrypt_NEON_loop_nr_1:
adc x11, x11, xzr
mov v2.d[1], x10
mov v2.d[0], x11
rev64 v2.16b, v2.16b
rev32 v2.16b, v2.16b
rev64 v2.4s, v2.4s
L_AES_CTR_encrypt_NEON_data_done:
rev32 v2.16b, v2.16b
st1 {v2.2d}, [x5]
@@ -49790,8 +49781,7 @@ _AES_XTS_decrypt_NEON:
mov x17, #0x87
ands w19, w2, #15
cset w16, ne
lsl w16, w16, #4
sub w2, w2, w16
sub w2, w2, w16, lsl 4
ld1 {v2.2d}, [x3]
ld1 {v4.2d}, [x5]
rev32 v2.16b, v2.16b
@@ -51689,11 +51679,10 @@ L_AES_set_encrypt_key_loop_256:
stp w6, w7, [x2]
stnp w8, w9, [x2, #8]
sub x2, x2, #16
mov w3, w9
ubfx w6, w3, #8, #8
ubfx w7, w3, #16, #8
ubfx w8, w3, #24, #8
ubfx w3, w3, #0, #8
ubfx w6, w9, #8, #8
ubfx w7, w9, #16, #8
ubfx w8, w9, #24, #8
ubfx w3, w9, #0, #8
lsl w6, w6, #2
lsl w7, w7, #2
lsl w8, w8, #2
@@ -55634,8 +55623,7 @@ _AES_XTS_decrypt:
#endif /* __APPLE__ */
ands w11, w2, #15
cset w11, ne
lsl w11, w11, #4
sub w2, w2, w11
sub w2, w2, w11, lsl 4
mov x11, #0x87
mov x28, x5
ldp x23, x24, [x3]
+15 -27
View File
@@ -44787,37 +44787,32 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
"ld1 {v4.2d}, [x9], #16\n\t"
"mov v8.d[1], x10\n\t"
"mov v8.d[0], x11\n\t"
"rev64 v8.16b, v8.16b\n\t"
"rev32 v8.16b, v8.16b\n\t"
"rev64 v8.4s, v8.4s\n\t"
/* Round: 0 - XOR in key schedule */
"eor v0.16b, v8.16b, v4.16b\n\t"
"adds x10, x10, #1\n\t"
"adc x11, x11, xzr\n\t"
"mov v8.d[1], x10\n\t"
"mov v8.d[0], x11\n\t"
"rev64 v8.16b, v8.16b\n\t"
"rev32 v8.16b, v8.16b\n\t"
"rev64 v8.4s, v8.4s\n\t"
"eor v1.16b, v8.16b, v4.16b\n\t"
"adds x10, x10, #1\n\t"
"adc x11, x11, xzr\n\t"
"mov v8.d[1], x10\n\t"
"mov v8.d[0], x11\n\t"
"rev64 v8.16b, v8.16b\n\t"
"rev32 v8.16b, v8.16b\n\t"
"rev64 v8.4s, v8.4s\n\t"
"eor v2.16b, v8.16b, v4.16b\n\t"
"adds x10, x10, #1\n\t"
"adc x11, x11, xzr\n\t"
"mov v8.d[1], x10\n\t"
"mov v8.d[0], x11\n\t"
"rev64 v8.16b, v8.16b\n\t"
"rev32 v8.16b, v8.16b\n\t"
"rev64 v8.4s, v8.4s\n\t"
"eor v3.16b, v8.16b, v4.16b\n\t"
"adds x10, x10, #1\n\t"
"adc x11, x11, xzr\n\t"
"mov v8.d[1], x10\n\t"
"mov v8.d[0], x11\n\t"
"rev64 v8.16b, v8.16b\n\t"
"rev32 v8.16b, v8.16b\n\t"
"rev64 v8.4s, v8.4s\n\t"
"sub w8, %w[nr], #2\n\t"
"\n"
"L_AES_CTR_encrypt_NEON_loop_nr_4_%=:\n\t"
@@ -45217,8 +45212,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
"b.ge L_AES_CTR_encrypt_NEON_loop_4_%=\n\t"
"mov v2.d[1], x10\n\t"
"mov v2.d[0], x11\n\t"
"rev64 v2.16b, v2.16b\n\t"
"rev32 v2.16b, v2.16b\n\t"
"rev64 v2.4s, v2.4s\n\t"
"\n"
"L_AES_CTR_encrypt_NEON_start_2_%=:\n\t"
"movi v12.16b, #0x40\n\t"
@@ -45238,15 +45232,13 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
"adc x11, x11, xzr\n\t"
"mov v2.d[1], x10\n\t"
"mov v2.d[0], x11\n\t"
"rev64 v2.16b, v2.16b\n\t"
"rev32 v2.16b, v2.16b\n\t"
"rev64 v2.4s, v2.4s\n\t"
"eor v1.16b, v2.16b, v4.16b\n\t"
"adds x10, x10, #1\n\t"
"adc x11, x11, xzr\n\t"
"mov v2.d[1], x10\n\t"
"mov v2.d[0], x11\n\t"
"rev64 v2.16b, v2.16b\n\t"
"rev32 v2.16b, v2.16b\n\t"
"rev64 v2.4s, v2.4s\n\t"
"sub w8, %w[nr], #2\n\t"
"\n"
"L_AES_CTR_encrypt_NEON_loop_nr_2_%=:\n\t"
@@ -45566,8 +45558,7 @@ void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out,
"adc x11, x11, xzr\n\t"
"mov v2.d[1], x10\n\t"
"mov v2.d[0], x11\n\t"
"rev64 v2.16b, v2.16b\n\t"
"rev32 v2.16b, v2.16b\n\t"
"rev64 v2.4s, v2.4s\n\t"
"\n"
"L_AES_CTR_encrypt_NEON_data_done_%=:\n\t"
"rev32 v2.16b, v2.16b\n\t"
@@ -49928,8 +49919,7 @@ void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i,
"mov x17, #0x87\n\t"
"ands w19, %w[sz], #15\n\t"
"cset w16, ne\n\t"
"lsl w16, w16, #4\n\t"
"sub %w[sz], %w[sz], w16\n\t"
"sub %w[sz], %w[sz], w16, lsl 4\n\t"
"ld1 {v2.2d}, [%x[i]]\n\t"
"ld1 {v4.2d}, [%x[key2]]\n\t"
"rev32 v2.16b, v2.16b\n\t"
@@ -51770,11 +51760,10 @@ void AES_set_encrypt_key(const unsigned char* key, word32 len,
"stp w6, w7, [%x[ks]]\n\t"
"stnp w8, w9, [%x[ks], #8]\n\t"
"sub %x[ks], %x[ks], #16\n\t"
"mov w3, w9\n\t"
"ubfx w6, w3, #8, #8\n\t"
"ubfx w7, w3, #16, #8\n\t"
"ubfx w8, w3, #24, #8\n\t"
"ubfx w3, w3, #0, #8\n\t"
"ubfx w6, w9, #8, #8\n\t"
"ubfx w7, w9, #16, #8\n\t"
"ubfx w8, w9, #24, #8\n\t"
"ubfx w3, w9, #0, #8\n\t"
"lsl w6, w6, #2\n\t"
"lsl w7, w7, #2\n\t"
"lsl w8, w8, #2\n\t"
@@ -55552,8 +55541,7 @@ void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i,
__asm__ __volatile__ (
"ands w11, %w[sz], #15\n\t"
"cset w11, ne\n\t"
"lsl w11, w11, #4\n\t"
"sub %w[sz], %w[sz], w11\n\t"
"sub %w[sz], %w[sz], w11, lsl 4\n\t"
"mov x11, #0x87\n\t"
"mov x28, %x[key2]\n\t"
"ldp x23, x24, [%x[i]]\n\t"
+1 -2
View File
@@ -1101,8 +1101,7 @@ L_chacha_use_over_arm64_byte_loop:
eor w5, w5, w4
subs x3, x3, #1
strb w5, [x1], #1
beq L_chacha_use_over_arm64_done
b L_chacha_use_over_arm64_byte_loop
bne L_chacha_use_over_arm64_byte_loop
L_chacha_use_over_arm64_done:
ret
#ifndef __APPLE__
+1 -2
View File
@@ -1024,8 +1024,7 @@ void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len)
"eor w5, w5, w4\n\t"
"subs %w[len], %w[len], #1\n\t"
"strb w5, [%x[output]], #1\n\t"
"b.eq L_chacha_use_over_arm64_done_%=\n\t"
"b L_chacha_use_over_arm64_byte_loop_%=\n\t"
"b.ne L_chacha_use_over_arm64_byte_loop_%=\n\t"
"\n"
"L_chacha_use_over_arm64_done_%=:\n\t"
: [over] "+r" (over), [output] "+r" (output), [len] "+r" (len)