Merge pull request #9627 from SparkiDev/aarch64_asm_chacha20_256

ChaCha20 AArch64 ASM: fix the 256-byte block case
David Garske
2026-01-08 10:24:48 -08:00
committed by GitHub
5 changed files with 597 additions and 590 deletions
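
For orientation, the comments in the assembly below ("a += b; d ^= a; d <<<= 16;" and so on) are the steps of the standard ChaCha quarter round, which the 256-byte path runs on four blocks per iteration (previously one block in general-purpose registers alongside three in NEON, now all four in NEON). A minimal C sketch of that quarter round, illustrative only and not code from this change:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter round on four words of the 16-word state.
 * The assembly in this PR interleaves several of these across
 * scalar and vector registers; this is only the reference form. */
static void chacha_quarter_round(uint32_t x[16], int a, int b, int c, int d)
{
    x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 16);
    x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 12);
    x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 8);
    x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 7);
}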

View File

@@ -186,3 +186,184 @@ int test_wc_Chacha_Process(void)
return EXPECT_RESULT();
} /* END test_wc_Chacha_Process */
#define CHACHA_LEN 1024
/*
 * Testing wc_Chacha_Process() with chunked input sizes
 */
int test_wc_Chacha_Process_Chunking(void)
{
EXPECT_DECLS;
#ifdef HAVE_CHACHA
ChaCha enc;
WC_DECLARE_VAR(plain, byte, CHACHA_LEN, NULL);
WC_DECLARE_VAR(cipher, byte, CHACHA_LEN, NULL);
byte key[CHACHA_MAX_KEY_SZ];
byte iv[CHACHA_IV_BYTES];
int i;
int cnt;
int sz;
const byte expected[CHACHA_LEN] = {
0xbc, 0xf5, 0x3b, 0xf2, 0x75, 0x85, 0x9e, 0x0a,
0x09, 0x58, 0x83, 0x50, 0x33, 0x12, 0x01, 0xa1,
0xb4, 0xaf, 0x8a, 0xe8, 0x4d, 0x3d, 0xa5, 0x68,
0xf7, 0x6d, 0x3e, 0xe0, 0x62, 0x7e, 0x62, 0x66,
0xdd, 0x07, 0xe9, 0x36, 0x6f, 0x4d, 0xe9, 0x7a,
0x16, 0x48, 0xa1, 0x83, 0x9e, 0x67, 0x4d, 0xa3,
0xfe, 0x7e, 0x4a, 0x31, 0xdd, 0xb6, 0x50, 0x39,
0xd2, 0x2b, 0x93, 0xf2, 0x4d, 0x51, 0x44, 0x42,
0x5d, 0xf1, 0xd9, 0x24, 0xd7, 0xef, 0x4b, 0xa4,
0xfd, 0x6a, 0x53, 0xa5, 0x1e, 0x4a, 0xc8, 0x68,
0x11, 0x69, 0xc6, 0xbd, 0xe1, 0x59, 0xe4, 0xca,
0x5b, 0xa9, 0x77, 0xfe, 0x4f, 0x82, 0x9f, 0xcf,
0x55, 0x16, 0x3c, 0xd5, 0x83, 0xee, 0xc7, 0x53,
0xaf, 0xca, 0x8a, 0xe2, 0xcf, 0xf1, 0x4b, 0x3b,
0x44, 0xf6, 0xc9, 0x6c, 0x5b, 0xd3, 0x28, 0x8a,
0x7e, 0x67, 0xaa, 0x9e, 0xad, 0xce, 0x96, 0xc4,
0x6e, 0x95, 0x8c, 0xf8, 0xf6, 0xb6, 0x42, 0x8e,
0xe7, 0xab, 0xc8, 0x2c, 0x66, 0x8b, 0x80, 0xcf,
0x78, 0xfe, 0x35, 0x8b, 0x59, 0x18, 0x45, 0xcb,
0x18, 0xd4, 0x09, 0x88, 0xa9, 0xf9, 0x27, 0xd1,
0x3b, 0x9d, 0x2b, 0xff, 0x89, 0x21, 0xb0, 0xd2,
0xa7, 0x7e, 0x35, 0x61, 0xae, 0x1c, 0xc3, 0x1c,
0x07, 0x5c, 0x10, 0x5d, 0x71, 0x3a, 0x3a, 0xe8,
0x4c, 0xba, 0x00, 0xde, 0xd1, 0xf9, 0xa1, 0xae,
0x7b, 0x91, 0x9d, 0x66, 0x31, 0x18, 0x55, 0x39,
0xec, 0x1d, 0x83, 0x85, 0x1e, 0x5b, 0x35, 0x17,
0x2e, 0xbc, 0x7a, 0x22, 0x79, 0x09, 0xa7, 0x02,
0xf7, 0x3b, 0x93, 0x2c, 0x89, 0x1b, 0x69, 0xde,
0x80, 0xc8, 0xdf, 0xce, 0xf9, 0xcd, 0xc8, 0x58,
0xd6, 0x4b, 0x65, 0x9a, 0xc4, 0x4f, 0x27, 0xdb,
0x9a, 0x6c, 0x3a, 0xef, 0x20, 0x0b, 0x00, 0x5c,
0x9f, 0x91, 0xc1, 0xf6, 0x80, 0x53, 0x6c, 0x42,
0xe3, 0xd0, 0xfb, 0x3b, 0x23, 0x75, 0x45, 0xa7,
0x5b, 0x9b, 0xaa, 0xcd, 0x1e, 0x03, 0x35, 0x68,
0x17, 0xee, 0xff, 0xd7, 0x4f, 0x77, 0x2f, 0xd0,
0x1d, 0x5e, 0x89, 0x16, 0x50, 0x6f, 0x22, 0x44,
0x10, 0x64, 0x37, 0x66, 0x70, 0x7f, 0x4d, 0x58,
0x36, 0xec, 0x56, 0x4e, 0xfd, 0x22, 0x8d, 0x77,
0xb1, 0x37, 0x07, 0x13, 0xdf, 0x34, 0x40, 0x1c,
0x65, 0x95, 0x9b, 0xb9, 0xac, 0x11, 0xfe, 0x7a,
0xae, 0x1f, 0x17, 0x94, 0xd4, 0xdd, 0x5b, 0x4f,
0x69, 0xa8, 0x04, 0x8e, 0x80, 0x87, 0x7d, 0x96,
0x25, 0x37, 0x83, 0x0e, 0xca, 0xa4, 0xb3, 0x29,
0x2f, 0x4b, 0x83, 0xa4, 0x01, 0x36, 0x0d, 0xdb,
0xd7, 0x6e, 0x7a, 0x9c, 0x3e, 0x82, 0xc8, 0x5f,
0x4e, 0xc6, 0xd2, 0x97, 0x64, 0xe6, 0xd9, 0x50,
0x89, 0xcb, 0x64, 0x33, 0x28, 0x9c, 0x14, 0xf9,
0x41, 0x33, 0x99, 0x0c, 0x87, 0x6f, 0x00, 0x3f,
0x00, 0x6f, 0xae, 0xe9, 0x20, 0xc2, 0xcd, 0xb8,
0x7a, 0x58, 0xde, 0x57, 0x34, 0xda, 0x63, 0xa1,
0x0b, 0x55, 0xfc, 0x54, 0x2a, 0xed, 0xc0, 0xbc,
0x29, 0x5f, 0x88, 0x7d, 0x37, 0x3b, 0x48, 0x86,
0x3f, 0x88, 0xa2, 0xef, 0x55, 0xe6, 0xc4, 0xf8,
0xb8, 0x11, 0x9e, 0x3a, 0x45, 0x79, 0xac, 0x85,
0xb2, 0x70, 0x40, 0xd0, 0x66, 0xe7, 0x66, 0xc8,
0x8e, 0x8f, 0xde, 0xde, 0xf8, 0x50, 0x79, 0x9e,
0x37, 0x04, 0x07, 0x83, 0x5b, 0xe0, 0x68, 0x5b,
0x32, 0xbc, 0x6e, 0x50, 0x05, 0xca, 0xf8, 0x3b,
0xec, 0x15, 0x13, 0xf8, 0x9a, 0xa2, 0x58, 0x98,
0x03, 0x29, 0x83, 0x7f, 0x11, 0xb4, 0x98, 0x41,
0xc1, 0xd9, 0x02, 0x6e, 0x2c, 0x45, 0x55, 0xab,
0xff, 0xcf, 0x23, 0x80, 0xf0, 0x82, 0x73, 0xe9,
0xe6, 0x8f, 0x1a, 0xd9, 0x70, 0xd6, 0x46, 0x1f,
0xa8, 0xf8, 0xbd, 0x14, 0xd9, 0x50, 0x59, 0x8e,
0x46, 0xbf, 0xe2, 0x8a, 0x8e, 0xce, 0xe7, 0x81,
0xf4, 0x3a, 0xd9, 0x07, 0xd8, 0x1d, 0x29, 0x19,
0xc1, 0x9d, 0xac, 0x6f, 0xfb, 0xce, 0x95, 0x03,
0x29, 0xce, 0x4a, 0x60, 0x34, 0x6a, 0x88, 0xc7,
0x5e, 0x8c, 0x71, 0x29, 0x81, 0x64, 0x2f, 0xfb,
0xb4, 0x20, 0x08, 0x57, 0xba, 0x50, 0x75, 0x7b,
0x1e, 0xfa, 0xcc, 0x60, 0xe7, 0x09, 0xab, 0x4e,
0x46, 0x64, 0xfe, 0x17, 0x00, 0x84, 0x8b, 0xca,
0xa8, 0xcb, 0x18, 0x5b, 0xa2, 0x04, 0x13, 0x68,
0x99, 0x02, 0xaf, 0xcb, 0x75, 0xcb, 0x46, 0x61,
0x66, 0x05, 0xd9, 0x5c, 0x6d, 0x8c, 0xf9, 0x8a,
0x57, 0xde, 0xf4, 0xb9, 0x5d, 0x51, 0x17, 0x4a,
0x8c, 0x42, 0xca, 0x0d, 0x7f, 0x92, 0x69, 0x0d,
0x88, 0x2b, 0xc6, 0xee, 0xbd, 0x5a, 0x32, 0x17,
0x84, 0xef, 0xf9, 0xd9, 0x51, 0x33, 0x57, 0x2f,
0x87, 0xf8, 0xda, 0x3c, 0x3c, 0x14, 0xa9, 0x26,
0xad, 0x19, 0xfd, 0x14, 0x5e, 0x33, 0x92, 0xb1,
0xe1, 0xd7, 0xfb, 0x1e, 0x55, 0x40, 0xe5, 0x80,
0x9b, 0x8e, 0x4b, 0x88, 0x58, 0x77, 0xa9, 0xd2,
0xbf, 0x40, 0x90, 0xbe, 0x8f, 0x1f, 0xa7, 0x8a,
0xaf, 0x8e, 0x03, 0x93, 0x4d, 0x8a, 0x73, 0x8e,
0x76, 0x67, 0x43, 0x37, 0xc1, 0x76, 0x87, 0x50,
0x37, 0xc4, 0x02, 0x4a, 0x53, 0x1a, 0x5b, 0xe8,
0x5f, 0xc8, 0x28, 0xad, 0xd3, 0x8a, 0x97, 0x53,
0xa3, 0xf6, 0x48, 0xba, 0x05, 0x18, 0x56, 0x90,
0xa9, 0x95, 0xd8, 0xac, 0xe9, 0xd5, 0x6c, 0xe3,
0x1f, 0xd8, 0xfc, 0xc5, 0x27, 0x19, 0xab, 0x4a,
0xc4, 0x36, 0xc9, 0xe9, 0xaa, 0x30, 0xef, 0x8e,
0x9e, 0x01, 0x18, 0x68, 0xe9, 0x06, 0xf8, 0x54,
0xe5, 0xe2, 0xec, 0xde, 0x52, 0xfc, 0x3b, 0xdd,
0xe9, 0xc7, 0xc8, 0x2b, 0x93, 0xd4, 0xdb, 0x28,
0x72, 0x06, 0x07, 0xd1, 0xba, 0x05, 0x23, 0xa6,
0x41, 0x42, 0x55, 0x6a, 0x6e, 0x6f, 0x6c, 0x40,
0x6a, 0x19, 0xa4, 0xd5, 0xa2, 0x11, 0xb5, 0x2b,
0x16, 0x4a, 0xe3, 0x41, 0xf3, 0xaf, 0x93, 0xbd,
0xc8, 0xd9, 0x26, 0x43, 0x71, 0x56, 0xd2, 0x5e,
0xf5, 0xa8, 0x3c, 0x64, 0x83, 0x04, 0x89, 0x62,
0x20, 0xd3, 0xe9, 0x8e, 0x60, 0xcd, 0xec, 0xd9,
0xce, 0x89, 0xf0, 0x5c, 0xf2, 0x26, 0x72, 0x51,
0xd5, 0x16, 0x7b, 0xef, 0x19, 0x10, 0xb4, 0xce,
0x60, 0x47, 0xab, 0x98, 0x86, 0xbd, 0x39, 0xb7,
0xc9, 0x29, 0x38, 0x1a, 0xc1, 0x5c, 0xab, 0x77,
0xea, 0xe9, 0xf4, 0x7f, 0x6a, 0x06, 0xf7, 0xc0,
0x0b, 0x17, 0x1f, 0x2f, 0xce, 0x07, 0x1b, 0x33,
0x68, 0x4d, 0x64, 0x6a, 0x28, 0x6d, 0x1d, 0xc6,
0x54, 0x5c, 0xa2, 0x69, 0xf9, 0xb4, 0x62, 0xc9,
0x71, 0xf5, 0xd1, 0xb7, 0x7b, 0x02, 0x81, 0x6d,
0x4b, 0x1f, 0x62, 0xc5, 0xce, 0x2e, 0xc6, 0x2a,
0x1d, 0x6f, 0xc7, 0xc1, 0x99, 0x48, 0x7b, 0xc7,
0xf3, 0x53, 0xb7, 0x02, 0x7f, 0x82, 0xda, 0xfa,
0xce, 0xd3, 0x54, 0xf8, 0x9b, 0x30, 0x6f, 0xed,
0x6c, 0xec, 0x1c, 0x21, 0x49, 0x04, 0x51, 0xae,
0xd0, 0x3f, 0xb1, 0xfb, 0x78, 0x1a, 0x6f, 0x35,
0xc8, 0x3f, 0x4c, 0x43, 0x71, 0xe9, 0xb8, 0xd7,
0x74, 0xca, 0x46, 0x68, 0xeb, 0xd9, 0xa3, 0x94,
0x6e, 0x9d, 0xea, 0x57, 0x22, 0x1e, 0x15, 0x27,
0x40, 0xd4, 0x0c, 0x32, 0x40, 0xc0, 0x40, 0x8a,
0x1e, 0x2e, 0x1a, 0x58, 0x84, 0xa0, 0xc3, 0x68,
0x96, 0xfe, 0xb0, 0x96, 0x6c, 0x04, 0x61, 0x35,
0x4a, 0x78, 0xc5, 0xeb, 0x50, 0xca, 0xcb, 0x22,
0x7b, 0x53, 0x02, 0xfa, 0x63, 0x28, 0x10, 0x68,
0x77, 0xab, 0xda, 0x7d, 0xd1, 0xc2, 0x3f, 0x95,
0xa6, 0x5a, 0x92, 0x56, 0xb3, 0xb0, 0x29, 0x7e,
0x0c, 0xb3, 0xc9, 0x39, 0x0f, 0x1f, 0x51, 0x9d
};
WC_ALLOC_VAR(plain, byte, CHACHA_LEN, NULL);
WC_ALLOC_VAR(cipher, byte, CHACHA_LEN, NULL);
XMEMSET(plain, 0xa5, CHACHA_LEN);
for (i = 0; i < (int)sizeof(key); i++) {
key[i] = (byte)i;
}
for (i = 0; i < (int)sizeof(iv); i++) {
iv[i] = (byte)(i + 0x40);
}
for (sz = 1; sz < CHACHA_LEN; sz++) {
ExpectIntEQ(wc_Chacha_SetKey(&enc, key, (word32)sizeof(key)), 0);
ExpectIntEQ(wc_Chacha_SetIV(&enc, iv, 0), 0);
for (cnt = 0; cnt + sz <= CHACHA_LEN; cnt += sz) {
ExpectIntEQ(wc_Chacha_Process(&enc, cipher + cnt, plain + cnt, sz),
0);
}
if (cnt < CHACHA_LEN) {
ExpectIntEQ(wc_Chacha_Process(&enc, cipher + cnt, plain + cnt,
CHACHA_LEN - cnt), 0);
}
ExpectBufEQ(cipher, expected, (int)sizeof(expected));
}
WC_FREE_VAR(plain, NULL);
WC_FREE_VAR(cipher, NULL);
#endif
return EXPECT_RESULT();
} /* END test_wc_Chacha_Process_Chunking */
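
The new test above checks that processing a message in chunks of any size produces the same ciphertext as a single pass. A caller-side sketch of that pattern, using only the APIs exercised by the test (helper name and parameters are illustrative):

/* Encrypt len bytes in chunk-sized pieces; the output should match a
 * single wc_Chacha_Process() call over the whole buffer (sketch only). */
static int chacha_encrypt_chunked(ChaCha* enc, byte* out, const byte* in,
                                  word32 len, word32 chunk)
{
    word32 off = 0;
    int ret;
    while (off + chunk <= len) {
        ret = wc_Chacha_Process(enc, out + off, in + off, chunk);
        if (ret != 0)
            return ret;
        off += chunk;
    }
    if (off < len)
        return wc_Chacha_Process(enc, out + off, in + off, len - off);
    return 0;
}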

View File

@@ -26,9 +26,11 @@
int test_wc_Chacha_SetKey(void);
int test_wc_Chacha_Process(void);
int test_wc_Chacha_Process_Chunking(void);
#define TEST_CHACHA_DECLS \
TEST_DECL_GROUP("chacha", test_wc_Chacha_SetKey), \
TEST_DECL_GROUP("chacha", test_wc_Chacha_Process)
#define TEST_CHACHA_DECLS \
TEST_DECL_GROUP("chacha", test_wc_Chacha_SetKey), \
TEST_DECL_GROUP("chacha", test_wc_Chacha_Process), \
TEST_DECL_GROUP("chacha", test_wc_Chacha_Process_Chunking)
#endif /* WOLFCRYPT_TEST_CHACHA_H */

View File

@@ -493,321 +493,233 @@ L_chacha_crypt_bytes_arm64_round_start_320:
L_chacha_crypt_bytes_arm64_lt_320:
cmp x3, #0x100
blt L_chacha_crypt_bytes_arm64_lt_256
# Move state into regular register
mov x8, v16.d[0]
mov x10, v16.d[1]
mov x12, v17.d[0]
mov x14, v17.d[1]
mov x16, v18.d[0]
mov x19, v18.d[1]
mov x21, v19.d[0]
mov x23, v19.d[1]
# Move state into vector registers
mov v0.16b, v16.16b
mov v1.16b, v17.16b
lsr x9, x8, #32
mov v2.16b, v18.16b
add w5, w21, #1
mov v3.16b, v19.16b
lsr x11, x10, #32
mov v4.16b, v16.16b
mov v5.16b, v17.16b
lsr x13, x12, #32
mov v6.16b, v18.16b
add w6, w21, #2
mov v7.16b, v19.16b
lsr x15, x14, #32
mov v8.16b, v16.16b
mov v9.16b, v17.16b
lsr x17, x16, #32
mov v10.16b, v18.16b
add w21, w21, #3
mov v11.16b, v19.16b
lsr x20, x19, #32
mov v7.s[0], w5
lsr x22, x21, #32
mov v11.s[0], w6
lsr x24, x23, #32
add w7, w21, #1
dup v0.4s, v16.s[0]
dup v1.4s, v16.s[1]
dup v2.4s, v16.s[2]
dup v3.4s, v16.s[3]
dup v4.4s, v17.s[0]
dup v5.4s, v17.s[1]
dup v6.4s, v17.s[2]
dup v7.4s, v17.s[3]
dup v8.4s, v18.s[0]
dup v9.4s, v18.s[1]
dup v10.4s, v18.s[2]
dup v11.4s, v18.s[3]
dup v12.4s, v19.s[0]
dup v13.4s, v19.s[1]
dup v14.4s, v19.s[2]
dup v15.4s, v19.s[3]
# Add to counter word
add v12.4s, v12.4s, v28.4s
# Set number of odd+even rounds to perform
mov x26, #10
L_chacha_crypt_bytes_arm64_round_start_256:
subs x26, x26, #1
# Round odd
# a += b; d ^= a; d <<<= 16;
add v0.4s, v0.4s, v1.4s
add w8, w8, w12
add v4.4s, v4.4s, v5.4s
add w9, w9, w13
add v8.4s, v8.4s, v9.4s
add w10, w10, w14
eor v3.16b, v3.16b, v0.16b
add w11, w11, w15
eor v7.16b, v7.16b, v4.16b
eor w21, w21, w8
eor v11.16b, v11.16b, v8.16b
eor w22, w22, w9
rev32 v3.8h, v3.8h
eor w23, w23, w10
rev32 v7.8h, v7.8h
eor w24, w24, w11
rev32 v11.8h, v11.8h
ror w21, w21, #16
add v0.4s, v0.4s, v4.4s
add v1.4s, v1.4s, v5.4s
add v2.4s, v2.4s, v6.4s
add v3.4s, v3.4s, v7.4s
eor v12.16b, v12.16b, v0.16b
eor v13.16b, v13.16b, v1.16b
eor v14.16b, v14.16b, v2.16b
eor v15.16b, v15.16b, v3.16b
rev32 v12.8h, v12.8h
rev32 v13.8h, v13.8h
rev32 v14.8h, v14.8h
rev32 v15.8h, v15.8h
# c += d; b ^= c; b <<<= 12;
add v2.4s, v2.4s, v3.4s
ror w22, w22, #16
add v6.4s, v6.4s, v7.4s
ror w23, w23, #16
add v10.4s, v10.4s, v11.4s
ror w24, w24, #16
eor v20.16b, v1.16b, v2.16b
add w16, w16, w21
eor v21.16b, v5.16b, v6.16b
add w17, w17, w22
eor v22.16b, v9.16b, v10.16b
add w19, w19, w23
shl v1.4s, v20.4s, #12
add w20, w20, w24
add v8.4s, v8.4s, v12.4s
add v9.4s, v9.4s, v13.4s
add v10.4s, v10.4s, v14.4s
add v11.4s, v11.4s, v15.4s
eor v20.16b, v4.16b, v8.16b
eor v21.16b, v5.16b, v9.16b
eor v22.16b, v6.16b, v10.16b
eor v23.16b, v7.16b, v11.16b
shl v4.4s, v20.4s, #12
shl v5.4s, v21.4s, #12
eor w12, w12, w16
shl v9.4s, v22.4s, #12
eor w13, w13, w17
sri v1.4s, v20.4s, #20
eor w14, w14, w19
shl v6.4s, v22.4s, #12
shl v7.4s, v23.4s, #12
sri v4.4s, v20.4s, #20
sri v5.4s, v21.4s, #20
eor w15, w15, w20
sri v9.4s, v22.4s, #20
ror w12, w12, #20
sri v6.4s, v22.4s, #20
sri v7.4s, v23.4s, #20
# a += b; d ^= a; d <<<= 8;
add v0.4s, v0.4s, v1.4s
ror w13, w13, #20
add v4.4s, v4.4s, v5.4s
ror w14, w14, #20
add v8.4s, v8.4s, v9.4s
ror w15, w15, #20
eor v3.16b, v3.16b, v0.16b
add w8, w8, w12
eor v7.16b, v7.16b, v4.16b
add w9, w9, w13
eor v11.16b, v11.16b, v8.16b
add w10, w10, w14
tbl v3.16b, {v3.16b}, v30.16b
add w11, w11, w15
tbl v7.16b, {v7.16b}, v30.16b
eor w21, w21, w8
tbl v11.16b, {v11.16b}, v30.16b
eor w22, w22, w9
add v0.4s, v0.4s, v4.4s
add v1.4s, v1.4s, v5.4s
add v2.4s, v2.4s, v6.4s
add v3.4s, v3.4s, v7.4s
eor v12.16b, v12.16b, v0.16b
eor v13.16b, v13.16b, v1.16b
eor v14.16b, v14.16b, v2.16b
eor v15.16b, v15.16b, v3.16b
tbl v12.16b, {v12.16b}, v30.16b
tbl v13.16b, {v13.16b}, v30.16b
tbl v14.16b, {v14.16b}, v30.16b
tbl v15.16b, {v15.16b}, v30.16b
# c += d; b ^= c; b <<<= 7;
add v2.4s, v2.4s, v3.4s
eor w23, w23, w10
add v6.4s, v6.4s, v7.4s
eor w24, w24, w11
add v10.4s, v10.4s, v11.4s
ror w21, w21, #24
eor v20.16b, v1.16b, v2.16b
ror w22, w22, #24
eor v21.16b, v5.16b, v6.16b
ror w23, w23, #24
eor v22.16b, v9.16b, v10.16b
ror w24, w24, #24
shl v1.4s, v20.4s, #7
add w16, w16, w21
add v8.4s, v8.4s, v12.4s
add v9.4s, v9.4s, v13.4s
add v10.4s, v10.4s, v14.4s
add v11.4s, v11.4s, v15.4s
eor v20.16b, v4.16b, v8.16b
eor v21.16b, v5.16b, v9.16b
eor v22.16b, v6.16b, v10.16b
eor v23.16b, v7.16b, v11.16b
shl v4.4s, v20.4s, #7
shl v5.4s, v21.4s, #7
add w17, w17, w22
shl v9.4s, v22.4s, #7
add w19, w19, w23
sri v1.4s, v20.4s, #25
add w20, w20, w24
shl v6.4s, v22.4s, #7
shl v7.4s, v23.4s, #7
sri v4.4s, v20.4s, #25
sri v5.4s, v21.4s, #25
eor w12, w12, w16
sri v9.4s, v22.4s, #25
eor w13, w13, w17
ext v3.16b, v3.16b, v3.16b, #12
eor w14, w14, w19
ext v7.16b, v7.16b, v7.16b, #12
eor w15, w15, w20
ext v11.16b, v11.16b, v11.16b, #12
ror w12, w12, #25
ext v1.16b, v1.16b, v1.16b, #4
ror w13, w13, #25
ext v5.16b, v5.16b, v5.16b, #4
ror w14, w14, #25
ext v9.16b, v9.16b, v9.16b, #4
ror w15, w15, #25
ext v2.16b, v2.16b, v2.16b, #8
ext v6.16b, v6.16b, v6.16b, #8
ext v10.16b, v10.16b, v10.16b, #8
sri v6.4s, v22.4s, #25
sri v7.4s, v23.4s, #25
# Round even
# a += b; d ^= a; d <<<= 16;
add v0.4s, v0.4s, v1.4s
add w8, w8, w13
add v4.4s, v4.4s, v5.4s
add w9, w9, w14
add v8.4s, v8.4s, v9.4s
add w10, w10, w15
eor v3.16b, v3.16b, v0.16b
add w11, w11, w12
eor v7.16b, v7.16b, v4.16b
eor w24, w24, w8
eor v11.16b, v11.16b, v8.16b
eor w21, w21, w9
rev32 v3.8h, v3.8h
eor w22, w22, w10
rev32 v7.8h, v7.8h
eor w23, w23, w11
rev32 v11.8h, v11.8h
ror w24, w24, #16
add v0.4s, v0.4s, v5.4s
add v1.4s, v1.4s, v6.4s
add v2.4s, v2.4s, v7.4s
add v3.4s, v3.4s, v4.4s
eor v15.16b, v15.16b, v0.16b
eor v12.16b, v12.16b, v1.16b
eor v13.16b, v13.16b, v2.16b
eor v14.16b, v14.16b, v3.16b
rev32 v15.8h, v15.8h
rev32 v12.8h, v12.8h
rev32 v13.8h, v13.8h
rev32 v14.8h, v14.8h
# c += d; b ^= c; b <<<= 12;
add v2.4s, v2.4s, v3.4s
ror w21, w21, #16
add v6.4s, v6.4s, v7.4s
ror w22, w22, #16
add v10.4s, v10.4s, v11.4s
ror w23, w23, #16
eor v20.16b, v1.16b, v2.16b
add w19, w19, w24
eor v21.16b, v5.16b, v6.16b
add w20, w20, w21
eor v22.16b, v9.16b, v10.16b
add w16, w16, w22
shl v1.4s, v20.4s, #12
add w17, w17, w23
shl v5.4s, v21.4s, #12
eor w13, w13, w19
shl v9.4s, v22.4s, #12
eor w14, w14, w20
sri v1.4s, v20.4s, #20
eor w15, w15, w16
sri v5.4s, v21.4s, #20
eor w12, w12, w17
sri v9.4s, v22.4s, #20
ror w13, w13, #20
add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v12.4s
add v8.4s, v8.4s, v13.4s
add v9.4s, v9.4s, v14.4s
eor v20.16b, v5.16b, v10.16b
eor v21.16b, v6.16b, v11.16b
eor v22.16b, v7.16b, v8.16b
eor v23.16b, v4.16b, v9.16b
shl v5.4s, v20.4s, #12
shl v6.4s, v21.4s, #12
shl v7.4s, v22.4s, #12
shl v4.4s, v23.4s, #12
sri v5.4s, v20.4s, #20
sri v6.4s, v21.4s, #20
sri v7.4s, v22.4s, #20
sri v4.4s, v23.4s, #20
# a += b; d ^= a; d <<<= 8;
add v0.4s, v0.4s, v1.4s
ror w14, w14, #20
add v4.4s, v4.4s, v5.4s
ror w15, w15, #20
add v8.4s, v8.4s, v9.4s
ror w12, w12, #20
eor v3.16b, v3.16b, v0.16b
add w8, w8, w13
eor v7.16b, v7.16b, v4.16b
add w9, w9, w14
eor v11.16b, v11.16b, v8.16b
add w10, w10, w15
tbl v3.16b, {v3.16b}, v30.16b
add w11, w11, w12
tbl v7.16b, {v7.16b}, v30.16b
eor w24, w24, w8
tbl v11.16b, {v11.16b}, v30.16b
eor w21, w21, w9
add v0.4s, v0.4s, v5.4s
add v1.4s, v1.4s, v6.4s
add v2.4s, v2.4s, v7.4s
add v3.4s, v3.4s, v4.4s
eor v15.16b, v15.16b, v0.16b
eor v12.16b, v12.16b, v1.16b
eor v13.16b, v13.16b, v2.16b
eor v14.16b, v14.16b, v3.16b
tbl v15.16b, {v15.16b}, v30.16b
tbl v12.16b, {v12.16b}, v30.16b
tbl v13.16b, {v13.16b}, v30.16b
tbl v14.16b, {v14.16b}, v30.16b
# c += d; b ^= c; b <<<= 7;
add v2.4s, v2.4s, v3.4s
eor w22, w22, w10
add v6.4s, v6.4s, v7.4s
eor w23, w23, w11
add v10.4s, v10.4s, v11.4s
ror w24, w24, #24
eor v20.16b, v1.16b, v2.16b
ror w21, w21, #24
eor v21.16b, v5.16b, v6.16b
ror w22, w22, #24
eor v22.16b, v9.16b, v10.16b
ror w23, w23, #24
shl v1.4s, v20.4s, #7
add w19, w19, w24
shl v5.4s, v21.4s, #7
add w20, w20, w21
shl v9.4s, v22.4s, #7
add w16, w16, w22
sri v1.4s, v20.4s, #25
add w17, w17, w23
sri v5.4s, v21.4s, #25
eor w13, w13, w19
sri v9.4s, v22.4s, #25
eor w14, w14, w20
ext v3.16b, v3.16b, v3.16b, #4
eor w15, w15, w16
ext v7.16b, v7.16b, v7.16b, #4
eor w12, w12, w17
ext v11.16b, v11.16b, v11.16b, #4
ror w13, w13, #25
ext v1.16b, v1.16b, v1.16b, #12
ror w14, w14, #25
ext v5.16b, v5.16b, v5.16b, #12
ror w15, w15, #25
ext v9.16b, v9.16b, v9.16b, #12
ror w12, w12, #25
ext v2.16b, v2.16b, v2.16b, #8
ext v6.16b, v6.16b, v6.16b, #8
ext v10.16b, v10.16b, v10.16b, #8
add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v12.4s
add v8.4s, v8.4s, v13.4s
add v9.4s, v9.4s, v14.4s
eor v20.16b, v5.16b, v10.16b
eor v21.16b, v6.16b, v11.16b
eor v22.16b, v7.16b, v8.16b
eor v23.16b, v4.16b, v9.16b
shl v5.4s, v20.4s, #7
shl v6.4s, v21.4s, #7
shl v7.4s, v22.4s, #7
shl v4.4s, v23.4s, #7
sri v5.4s, v20.4s, #25
sri v6.4s, v21.4s, #25
sri v7.4s, v22.4s, #25
sri v4.4s, v23.4s, #25
bne L_chacha_crypt_bytes_arm64_round_start_256
mov x26, #4
# Add counter now rather than after transposed
add v12.4s, v12.4s, v28.4s
# Load message
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
# Add one (2 added during calculating vector results)
add w21, w21, #1
# Add back state, XOR msg, store (load next block)
add v0.4s, v0.4s, v16.4s
add v1.4s, v1.4s, v17.4s
add v2.4s, v2.4s, v18.4s
add v3.4s, v3.4s, v19.4s
eor v0.16b, v0.16b, v20.16b
eor v1.16b, v1.16b, v21.16b
eor v2.16b, v2.16b, v22.16b
eor v3.16b, v3.16b, v23.16b
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #0x40
mov v19.s[0], w5
add v4.4s, v4.4s, v16.4s
add v5.4s, v5.4s, v17.4s
add v6.4s, v6.4s, v18.4s
add v7.4s, v7.4s, v19.4s
eor v4.16b, v4.16b, v20.16b
eor v5.16b, v5.16b, v21.16b
eor v6.16b, v6.16b, v22.16b
eor v7.16b, v7.16b, v23.16b
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #0x40
mov v19.s[0], w6
add v8.4s, v8.4s, v16.4s
add v9.4s, v9.4s, v17.4s
add v10.4s, v10.4s, v18.4s
add v11.4s, v11.4s, v19.4s
eor v8.16b, v8.16b, v20.16b
eor v9.16b, v9.16b, v21.16b
eor v10.16b, v10.16b, v22.16b
eor v11.16b, v11.16b, v23.16b
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x1], #0x40
# Move regular registers into vector registers for adding and xor
orr x8, x8, x9, lsl 32
orr x10, x10, x11, lsl 32
orr x12, x12, x13, lsl 32
mov v0.d[0], x8
orr x14, x14, x15, lsl 32
mov v0.d[1], x10
orr x16, x16, x17, lsl 32
mov v1.d[0], x12
orr x19, x19, x20, lsl 32
mov v1.d[1], x14
orr x21, x21, x22, lsl 32
mov v2.d[0], x16
orr x23, x23, x24, lsl 32
mov v2.d[1], x19
mov v3.d[0], x21
mov v3.d[1], x23
# Add back state, XOR in message and store
add v0.4s, v0.4s, v16.4s
add v1.4s, v1.4s, v17.4s
add v2.4s, v2.4s, v18.4s
add v3.4s, v3.4s, v19.4s
eor v0.16b, v0.16b, v20.16b
eor v1.16b, v1.16b, v21.16b
eor v2.16b, v2.16b, v22.16b
eor v3.16b, v3.16b, v23.16b
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #0x40
mov v19.d[0], x7
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
# Transpose vectors
trn1 v20.4s, v0.4s, v1.4s
trn1 v22.4s, v2.4s, v3.4s
trn2 v21.4s, v0.4s, v1.4s
trn2 v23.4s, v2.4s, v3.4s
trn1 v0.2d, v20.2d, v22.2d
trn1 v1.2d, v21.2d, v23.2d
trn2 v2.2d, v20.2d, v22.2d
trn2 v3.2d, v21.2d, v23.2d
trn1 v20.4s, v4.4s, v5.4s
trn1 v22.4s, v6.4s, v7.4s
trn2 v21.4s, v4.4s, v5.4s
trn2 v23.4s, v6.4s, v7.4s
trn1 v4.2d, v20.2d, v22.2d
trn1 v5.2d, v21.2d, v23.2d
trn2 v6.2d, v20.2d, v22.2d
trn2 v7.2d, v21.2d, v23.2d
trn1 v20.4s, v8.4s, v9.4s
trn1 v22.4s, v10.4s, v11.4s
trn2 v21.4s, v8.4s, v9.4s
trn2 v23.4s, v10.4s, v11.4s
trn1 v8.2d, v20.2d, v22.2d
trn1 v9.2d, v21.2d, v23.2d
trn2 v10.2d, v20.2d, v22.2d
trn2 v11.2d, v21.2d, v23.2d
trn1 v20.4s, v12.4s, v13.4s
trn1 v22.4s, v14.4s, v15.4s
trn2 v21.4s, v12.4s, v13.4s
trn2 v23.4s, v14.4s, v15.4s
trn1 v12.2d, v20.2d, v22.2d
trn1 v13.2d, v21.2d, v23.2d
trn2 v14.2d, v20.2d, v22.2d
trn2 v15.2d, v21.2d, v23.2d
# Add back state, XOR in message and store (load next block)
add v20.4s, v0.4s, v16.4s
add v21.4s, v4.4s, v17.4s
add v22.4s, v8.4s, v18.4s
add v23.4s, v12.4s, v19.4s
eor v20.16b, v20.16b, v24.16b
eor v21.16b, v21.16b, v25.16b
eor v22.16b, v22.16b, v26.16b
eor v23.16b, v23.16b, v27.16b
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
add v20.4s, v1.4s, v16.4s
add v21.4s, v5.4s, v17.4s
add v22.4s, v9.4s, v18.4s
add v23.4s, v13.4s, v19.4s
eor v20.16b, v20.16b, v24.16b
eor v21.16b, v21.16b, v25.16b
eor v22.16b, v22.16b, v26.16b
eor v23.16b, v23.16b, v27.16b
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
add v20.4s, v2.4s, v16.4s
add v21.4s, v6.4s, v17.4s
add v22.4s, v10.4s, v18.4s
add v23.4s, v14.4s, v19.4s
eor v20.16b, v20.16b, v24.16b
eor v21.16b, v21.16b, v25.16b
eor v22.16b, v22.16b, v26.16b
eor v23.16b, v23.16b, v27.16b
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
add v20.4s, v3.4s, v16.4s
add v21.4s, v7.4s, v17.4s
add v22.4s, v11.4s, v18.4s
add v23.4s, v15.4s, v19.4s
eor v20.16b, v20.16b, v24.16b
eor v21.16b, v21.16b, v25.16b
eor v22.16b, v22.16b, v26.16b
eor v23.16b, v23.16b, v27.16b
st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
mov v29.s[0], w26
sub x3, x3, #0x100
add v19.4s, v19.4s, v29.4s
# Done 256-byte block
L_chacha_crypt_bytes_arm64_lt_256:
cmp x3, #0x80
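
In the rewritten 256-byte path, the dup instructions give each NEON register one state word across all four blocks (a "column" layout), the rounds run entirely on those vectors, and the trn1/trn2 sequence then transposes the lanes back into per-block order before the initial state is added and the message XORed. A rough C picture of that final transpose, illustrative only and not code from this change:

#include <stdint.h>

/* lanes[w][b] = state word w of block b, as laid out by the dup
 * instructions; blocks[b][w] = per-block order needed for the
 * add-state / XOR-message step (what the trn1/trn2 sequence achieves). */
static void transpose_4way(const uint32_t lanes[16][4], uint32_t blocks[4][16])
{
    for (int w = 0; w < 16; w++) {
        for (int b = 0; b < 4; b++) {
            blocks[b][w] = lanes[w][b];
        }
    }
}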

View File

@@ -439,42 +439,25 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
"L_chacha_crypt_bytes_arm64_lt_320_%=: \n\t"
"cmp %w[len], #0x100\n\t"
"b.lt L_chacha_crypt_bytes_arm64_lt_256_%=\n\t"
/* Move state into regular register */
"mov x8, v16.d[0]\n\t"
"mov x10, v16.d[1]\n\t"
"mov x12, v17.d[0]\n\t"
"mov x14, v17.d[1]\n\t"
"mov x16, v18.d[0]\n\t"
"mov x19, v18.d[1]\n\t"
"mov x21, v19.d[0]\n\t"
"mov x23, v19.d[1]\n\t"
/* Move state into vector registers */
"mov v0.16b, v16.16b\n\t"
"mov v1.16b, v17.16b\n\t"
"lsr x9, x8, #32\n\t"
"mov v2.16b, v18.16b\n\t"
"add %w[rol8], w21, #1\n\t"
"mov v3.16b, v19.16b\n\t"
"lsr x11, x10, #32\n\t"
"mov v4.16b, v16.16b\n\t"
"mov v5.16b, v17.16b\n\t"
"lsr x13, x12, #32\n\t"
"mov v6.16b, v18.16b\n\t"
"add %w[ctr], w21, #2\n\t"
"mov v7.16b, v19.16b\n\t"
"lsr x15, x14, #32\n\t"
"mov v8.16b, v16.16b\n\t"
"mov v9.16b, v17.16b\n\t"
"lsr x17, x16, #32\n\t"
"mov v10.16b, v18.16b\n\t"
"add w21, w21, #3\n\t"
"mov v11.16b, v19.16b\n\t"
"lsr x20, x19, #32\n\t"
"mov v7.s[0], %w[rol8]\n\t"
"lsr x22, x21, #32\n\t"
"mov v11.s[0], %w[ctr]\n\t"
"lsr x24, x23, #32\n\t"
"add w7, w21, #1\n\t"
"dup v0.4s, v16.s[0]\n\t"
"dup v1.4s, v16.s[1]\n\t"
"dup v2.4s, v16.s[2]\n\t"
"dup v3.4s, v16.s[3]\n\t"
"dup v4.4s, v17.s[0]\n\t"
"dup v5.4s, v17.s[1]\n\t"
"dup v6.4s, v17.s[2]\n\t"
"dup v7.4s, v17.s[3]\n\t"
"dup v8.4s, v18.s[0]\n\t"
"dup v9.4s, v18.s[1]\n\t"
"dup v10.4s, v18.s[2]\n\t"
"dup v11.4s, v18.s[3]\n\t"
"dup v12.4s, v19.s[0]\n\t"
"dup v13.4s, v19.s[1]\n\t"
"dup v14.4s, v19.s[2]\n\t"
"dup v15.4s, v19.s[3]\n\t"
/* Add to counter word */
"add v12.4s, v12.4s, v28.4s\n\t"
/* Set number of odd+even rounds to perform */
"mov x26, #10\n\t"
"\n"
@@ -482,279 +465,208 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
"subs x26, x26, #1\n\t"
/* Round odd */
/* a += b; d ^= a; d <<<= 16; */
"add v0.4s, v0.4s, v1.4s\n\t"
"add w8, w8, w12\n\t"
"add v4.4s, v4.4s, v5.4s\n\t"
"add w9, w9, w13\n\t"
"add v8.4s, v8.4s, v9.4s\n\t"
"add w10, w10, w14\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"add w11, w11, w15\n\t"
"eor v7.16b, v7.16b, v4.16b\n\t"
"eor w21, w21, w8\n\t"
"eor v11.16b, v11.16b, v8.16b\n\t"
"eor w22, w22, w9\n\t"
"rev32 v3.8h, v3.8h\n\t"
"eor w23, w23, w10\n\t"
"rev32 v7.8h, v7.8h\n\t"
"eor w24, w24, w11\n\t"
"rev32 v11.8h, v11.8h\n\t"
"ror w21, w21, #16\n\t"
"add v0.4s, v0.4s, v4.4s\n\t"
"add v1.4s, v1.4s, v5.4s\n\t"
"add v2.4s, v2.4s, v6.4s\n\t"
"add v3.4s, v3.4s, v7.4s\n\t"
"eor v12.16b, v12.16b, v0.16b\n\t"
"eor v13.16b, v13.16b, v1.16b\n\t"
"eor v14.16b, v14.16b, v2.16b\n\t"
"eor v15.16b, v15.16b, v3.16b\n\t"
"rev32 v12.8h, v12.8h\n\t"
"rev32 v13.8h, v13.8h\n\t"
"rev32 v14.8h, v14.8h\n\t"
"rev32 v15.8h, v15.8h\n\t"
/* c += d; b ^= c; b <<<= 12; */
"add v2.4s, v2.4s, v3.4s\n\t"
"ror w22, w22, #16\n\t"
"add v6.4s, v6.4s, v7.4s\n\t"
"ror w23, w23, #16\n\t"
"add v10.4s, v10.4s, v11.4s\n\t"
"ror w24, w24, #16\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"add w16, w16, w21\n\t"
"eor v21.16b, v5.16b, v6.16b\n\t"
"add w17, w17, w22\n\t"
"eor v22.16b, v9.16b, v10.16b\n\t"
"add w19, w19, w23\n\t"
"shl v1.4s, v20.4s, #12\n\t"
"add w20, w20, w24\n\t"
"add v8.4s, v8.4s, v12.4s\n\t"
"add v9.4s, v9.4s, v13.4s\n\t"
"add v10.4s, v10.4s, v14.4s\n\t"
"add v11.4s, v11.4s, v15.4s\n\t"
"eor v20.16b, v4.16b, v8.16b\n\t"
"eor v21.16b, v5.16b, v9.16b\n\t"
"eor v22.16b, v6.16b, v10.16b\n\t"
"eor v23.16b, v7.16b, v11.16b\n\t"
"shl v4.4s, v20.4s, #12\n\t"
"shl v5.4s, v21.4s, #12\n\t"
"eor w12, w12, w16\n\t"
"shl v9.4s, v22.4s, #12\n\t"
"eor w13, w13, w17\n\t"
"sri v1.4s, v20.4s, #20\n\t"
"eor w14, w14, w19\n\t"
"shl v6.4s, v22.4s, #12\n\t"
"shl v7.4s, v23.4s, #12\n\t"
"sri v4.4s, v20.4s, #20\n\t"
"sri v5.4s, v21.4s, #20\n\t"
"eor w15, w15, w20\n\t"
"sri v9.4s, v22.4s, #20\n\t"
"ror w12, w12, #20\n\t"
"sri v6.4s, v22.4s, #20\n\t"
"sri v7.4s, v23.4s, #20\n\t"
/* a += b; d ^= a; d <<<= 8; */
"add v0.4s, v0.4s, v1.4s\n\t"
"ror w13, w13, #20\n\t"
"add v4.4s, v4.4s, v5.4s\n\t"
"ror w14, w14, #20\n\t"
"add v8.4s, v8.4s, v9.4s\n\t"
"ror w15, w15, #20\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"add w8, w8, w12\n\t"
"eor v7.16b, v7.16b, v4.16b\n\t"
"add w9, w9, w13\n\t"
"eor v11.16b, v11.16b, v8.16b\n\t"
"add w10, w10, w14\n\t"
"tbl v3.16b, {v3.16b}, v30.16b\n\t"
"add w11, w11, w15\n\t"
"tbl v7.16b, {v7.16b}, v30.16b\n\t"
"eor w21, w21, w8\n\t"
"tbl v11.16b, {v11.16b}, v30.16b\n\t"
"eor w22, w22, w9\n\t"
"add v0.4s, v0.4s, v4.4s\n\t"
"add v1.4s, v1.4s, v5.4s\n\t"
"add v2.4s, v2.4s, v6.4s\n\t"
"add v3.4s, v3.4s, v7.4s\n\t"
"eor v12.16b, v12.16b, v0.16b\n\t"
"eor v13.16b, v13.16b, v1.16b\n\t"
"eor v14.16b, v14.16b, v2.16b\n\t"
"eor v15.16b, v15.16b, v3.16b\n\t"
"tbl v12.16b, {v12.16b}, v30.16b\n\t"
"tbl v13.16b, {v13.16b}, v30.16b\n\t"
"tbl v14.16b, {v14.16b}, v30.16b\n\t"
"tbl v15.16b, {v15.16b}, v30.16b\n\t"
/* c += d; b ^= c; b <<<= 7; */
"add v2.4s, v2.4s, v3.4s\n\t"
"eor w23, w23, w10\n\t"
"add v6.4s, v6.4s, v7.4s\n\t"
"eor w24, w24, w11\n\t"
"add v10.4s, v10.4s, v11.4s\n\t"
"ror w21, w21, #24\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"ror w22, w22, #24\n\t"
"eor v21.16b, v5.16b, v6.16b\n\t"
"ror w23, w23, #24\n\t"
"eor v22.16b, v9.16b, v10.16b\n\t"
"ror w24, w24, #24\n\t"
"shl v1.4s, v20.4s, #7\n\t"
"add w16, w16, w21\n\t"
"add v8.4s, v8.4s, v12.4s\n\t"
"add v9.4s, v9.4s, v13.4s\n\t"
"add v10.4s, v10.4s, v14.4s\n\t"
"add v11.4s, v11.4s, v15.4s\n\t"
"eor v20.16b, v4.16b, v8.16b\n\t"
"eor v21.16b, v5.16b, v9.16b\n\t"
"eor v22.16b, v6.16b, v10.16b\n\t"
"eor v23.16b, v7.16b, v11.16b\n\t"
"shl v4.4s, v20.4s, #7\n\t"
"shl v5.4s, v21.4s, #7\n\t"
"add w17, w17, w22\n\t"
"shl v9.4s, v22.4s, #7\n\t"
"add w19, w19, w23\n\t"
"sri v1.4s, v20.4s, #25\n\t"
"add w20, w20, w24\n\t"
"shl v6.4s, v22.4s, #7\n\t"
"shl v7.4s, v23.4s, #7\n\t"
"sri v4.4s, v20.4s, #25\n\t"
"sri v5.4s, v21.4s, #25\n\t"
"eor w12, w12, w16\n\t"
"sri v9.4s, v22.4s, #25\n\t"
"eor w13, w13, w17\n\t"
"ext v3.16b, v3.16b, v3.16b, #12\n\t"
"eor w14, w14, w19\n\t"
"ext v7.16b, v7.16b, v7.16b, #12\n\t"
"eor w15, w15, w20\n\t"
"ext v11.16b, v11.16b, v11.16b, #12\n\t"
"ror w12, w12, #25\n\t"
"ext v1.16b, v1.16b, v1.16b, #4\n\t"
"ror w13, w13, #25\n\t"
"ext v5.16b, v5.16b, v5.16b, #4\n\t"
"ror w14, w14, #25\n\t"
"ext v9.16b, v9.16b, v9.16b, #4\n\t"
"ror w15, w15, #25\n\t"
"ext v2.16b, v2.16b, v2.16b, #8\n\t"
"ext v6.16b, v6.16b, v6.16b, #8\n\t"
"ext v10.16b, v10.16b, v10.16b, #8\n\t"
"sri v6.4s, v22.4s, #25\n\t"
"sri v7.4s, v23.4s, #25\n\t"
/* Round even */
/* a += b; d ^= a; d <<<= 16; */
"add v0.4s, v0.4s, v1.4s\n\t"
"add w8, w8, w13\n\t"
"add v4.4s, v4.4s, v5.4s\n\t"
"add w9, w9, w14\n\t"
"add v8.4s, v8.4s, v9.4s\n\t"
"add w10, w10, w15\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"add w11, w11, w12\n\t"
"eor v7.16b, v7.16b, v4.16b\n\t"
"eor w24, w24, w8\n\t"
"eor v11.16b, v11.16b, v8.16b\n\t"
"eor w21, w21, w9\n\t"
"rev32 v3.8h, v3.8h\n\t"
"eor w22, w22, w10\n\t"
"rev32 v7.8h, v7.8h\n\t"
"eor w23, w23, w11\n\t"
"rev32 v11.8h, v11.8h\n\t"
"ror w24, w24, #16\n\t"
"add v0.4s, v0.4s, v5.4s\n\t"
"add v1.4s, v1.4s, v6.4s\n\t"
"add v2.4s, v2.4s, v7.4s\n\t"
"add v3.4s, v3.4s, v4.4s\n\t"
"eor v15.16b, v15.16b, v0.16b\n\t"
"eor v12.16b, v12.16b, v1.16b\n\t"
"eor v13.16b, v13.16b, v2.16b\n\t"
"eor v14.16b, v14.16b, v3.16b\n\t"
"rev32 v15.8h, v15.8h\n\t"
"rev32 v12.8h, v12.8h\n\t"
"rev32 v13.8h, v13.8h\n\t"
"rev32 v14.8h, v14.8h\n\t"
/* c += d; b ^= c; b <<<= 12; */
"add v2.4s, v2.4s, v3.4s\n\t"
"ror w21, w21, #16\n\t"
"add v6.4s, v6.4s, v7.4s\n\t"
"ror w22, w22, #16\n\t"
"add v10.4s, v10.4s, v11.4s\n\t"
"ror w23, w23, #16\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"add w19, w19, w24\n\t"
"eor v21.16b, v5.16b, v6.16b\n\t"
"add w20, w20, w21\n\t"
"eor v22.16b, v9.16b, v10.16b\n\t"
"add w16, w16, w22\n\t"
"shl v1.4s, v20.4s, #12\n\t"
"add w17, w17, w23\n\t"
"shl v5.4s, v21.4s, #12\n\t"
"eor w13, w13, w19\n\t"
"shl v9.4s, v22.4s, #12\n\t"
"eor w14, w14, w20\n\t"
"sri v1.4s, v20.4s, #20\n\t"
"eor w15, w15, w16\n\t"
"sri v5.4s, v21.4s, #20\n\t"
"eor w12, w12, w17\n\t"
"sri v9.4s, v22.4s, #20\n\t"
"ror w13, w13, #20\n\t"
"add v10.4s, v10.4s, v15.4s\n\t"
"add v11.4s, v11.4s, v12.4s\n\t"
"add v8.4s, v8.4s, v13.4s\n\t"
"add v9.4s, v9.4s, v14.4s\n\t"
"eor v20.16b, v5.16b, v10.16b\n\t"
"eor v21.16b, v6.16b, v11.16b\n\t"
"eor v22.16b, v7.16b, v8.16b\n\t"
"eor v23.16b, v4.16b, v9.16b\n\t"
"shl v5.4s, v20.4s, #12\n\t"
"shl v6.4s, v21.4s, #12\n\t"
"shl v7.4s, v22.4s, #12\n\t"
"shl v4.4s, v23.4s, #12\n\t"
"sri v5.4s, v20.4s, #20\n\t"
"sri v6.4s, v21.4s, #20\n\t"
"sri v7.4s, v22.4s, #20\n\t"
"sri v4.4s, v23.4s, #20\n\t"
/* a += b; d ^= a; d <<<= 8; */
"add v0.4s, v0.4s, v1.4s\n\t"
"ror w14, w14, #20\n\t"
"add v4.4s, v4.4s, v5.4s\n\t"
"ror w15, w15, #20\n\t"
"add v8.4s, v8.4s, v9.4s\n\t"
"ror w12, w12, #20\n\t"
"eor v3.16b, v3.16b, v0.16b\n\t"
"add w8, w8, w13\n\t"
"eor v7.16b, v7.16b, v4.16b\n\t"
"add w9, w9, w14\n\t"
"eor v11.16b, v11.16b, v8.16b\n\t"
"add w10, w10, w15\n\t"
"tbl v3.16b, {v3.16b}, v30.16b\n\t"
"add w11, w11, w12\n\t"
"tbl v7.16b, {v7.16b}, v30.16b\n\t"
"eor w24, w24, w8\n\t"
"tbl v11.16b, {v11.16b}, v30.16b\n\t"
"eor w21, w21, w9\n\t"
"add v0.4s, v0.4s, v5.4s\n\t"
"add v1.4s, v1.4s, v6.4s\n\t"
"add v2.4s, v2.4s, v7.4s\n\t"
"add v3.4s, v3.4s, v4.4s\n\t"
"eor v15.16b, v15.16b, v0.16b\n\t"
"eor v12.16b, v12.16b, v1.16b\n\t"
"eor v13.16b, v13.16b, v2.16b\n\t"
"eor v14.16b, v14.16b, v3.16b\n\t"
"tbl v15.16b, {v15.16b}, v30.16b\n\t"
"tbl v12.16b, {v12.16b}, v30.16b\n\t"
"tbl v13.16b, {v13.16b}, v30.16b\n\t"
"tbl v14.16b, {v14.16b}, v30.16b\n\t"
/* c += d; b ^= c; b <<<= 7; */
"add v2.4s, v2.4s, v3.4s\n\t"
"eor w22, w22, w10\n\t"
"add v6.4s, v6.4s, v7.4s\n\t"
"eor w23, w23, w11\n\t"
"add v10.4s, v10.4s, v11.4s\n\t"
"ror w24, w24, #24\n\t"
"eor v20.16b, v1.16b, v2.16b\n\t"
"ror w21, w21, #24\n\t"
"eor v21.16b, v5.16b, v6.16b\n\t"
"ror w22, w22, #24\n\t"
"eor v22.16b, v9.16b, v10.16b\n\t"
"ror w23, w23, #24\n\t"
"shl v1.4s, v20.4s, #7\n\t"
"add w19, w19, w24\n\t"
"shl v5.4s, v21.4s, #7\n\t"
"add w20, w20, w21\n\t"
"shl v9.4s, v22.4s, #7\n\t"
"add w16, w16, w22\n\t"
"sri v1.4s, v20.4s, #25\n\t"
"add w17, w17, w23\n\t"
"sri v5.4s, v21.4s, #25\n\t"
"eor w13, w13, w19\n\t"
"sri v9.4s, v22.4s, #25\n\t"
"eor w14, w14, w20\n\t"
"ext v3.16b, v3.16b, v3.16b, #4\n\t"
"eor w15, w15, w16\n\t"
"ext v7.16b, v7.16b, v7.16b, #4\n\t"
"eor w12, w12, w17\n\t"
"ext v11.16b, v11.16b, v11.16b, #4\n\t"
"ror w13, w13, #25\n\t"
"ext v1.16b, v1.16b, v1.16b, #12\n\t"
"ror w14, w14, #25\n\t"
"ext v5.16b, v5.16b, v5.16b, #12\n\t"
"ror w15, w15, #25\n\t"
"ext v9.16b, v9.16b, v9.16b, #12\n\t"
"ror w12, w12, #25\n\t"
"ext v2.16b, v2.16b, v2.16b, #8\n\t"
"ext v6.16b, v6.16b, v6.16b, #8\n\t"
"ext v10.16b, v10.16b, v10.16b, #8\n\t"
"add v10.4s, v10.4s, v15.4s\n\t"
"add v11.4s, v11.4s, v12.4s\n\t"
"add v8.4s, v8.4s, v13.4s\n\t"
"add v9.4s, v9.4s, v14.4s\n\t"
"eor v20.16b, v5.16b, v10.16b\n\t"
"eor v21.16b, v6.16b, v11.16b\n\t"
"eor v22.16b, v7.16b, v8.16b\n\t"
"eor v23.16b, v4.16b, v9.16b\n\t"
"shl v5.4s, v20.4s, #7\n\t"
"shl v6.4s, v21.4s, #7\n\t"
"shl v7.4s, v22.4s, #7\n\t"
"shl v4.4s, v23.4s, #7\n\t"
"sri v5.4s, v20.4s, #25\n\t"
"sri v6.4s, v21.4s, #25\n\t"
"sri v7.4s, v22.4s, #25\n\t"
"sri v4.4s, v23.4s, #25\n\t"
"b.ne L_chacha_crypt_bytes_arm64_round_start_256_%=\n\t"
"mov x26, #4\n\t"
/* Add counter now rather than after transposed */
"add v12.4s, v12.4s, v28.4s\n\t"
/* Load message */
"ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
/* Add one (2 added during calculating vector results) */
"add w21, w21, #1\n\t"
/* Add back state, XOR msg, store (load next block) */
"add v0.4s, v0.4s, v16.4s\n\t"
"add v1.4s, v1.4s, v17.4s\n\t"
"add v2.4s, v2.4s, v18.4s\n\t"
"add v3.4s, v3.4s, v19.4s\n\t"
"eor v0.16b, v0.16b, v20.16b\n\t"
"eor v1.16b, v1.16b, v21.16b\n\t"
"eor v2.16b, v2.16b, v22.16b\n\t"
"eor v3.16b, v3.16b, v23.16b\n\t"
"ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[c]], #0x40\n\t"
"mov v19.s[0], %w[rol8]\n\t"
"add v4.4s, v4.4s, v16.4s\n\t"
"add v5.4s, v5.4s, v17.4s\n\t"
"add v6.4s, v6.4s, v18.4s\n\t"
"add v7.4s, v7.4s, v19.4s\n\t"
"eor v4.16b, v4.16b, v20.16b\n\t"
"eor v5.16b, v5.16b, v21.16b\n\t"
"eor v6.16b, v6.16b, v22.16b\n\t"
"eor v7.16b, v7.16b, v23.16b\n\t"
"ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
"st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%x[c]], #0x40\n\t"
"mov v19.s[0], %w[ctr]\n\t"
"add v8.4s, v8.4s, v16.4s\n\t"
"add v9.4s, v9.4s, v17.4s\n\t"
"add v10.4s, v10.4s, v18.4s\n\t"
"add v11.4s, v11.4s, v19.4s\n\t"
"eor v8.16b, v8.16b, v20.16b\n\t"
"eor v9.16b, v9.16b, v21.16b\n\t"
"eor v10.16b, v10.16b, v22.16b\n\t"
"eor v11.16b, v11.16b, v23.16b\n\t"
"ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
"st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%x[c]], #0x40\n\t"
/* Move regular registers into vector registers for adding and xor */
"orr x8, x8, x9, lsl 32\n\t"
"orr x10, x10, x11, lsl 32\n\t"
"orr x12, x12, x13, lsl 32\n\t"
"mov v0.d[0], x8\n\t"
"orr x14, x14, x15, lsl 32\n\t"
"mov v0.d[1], x10\n\t"
"orr x16, x16, x17, lsl 32\n\t"
"mov v1.d[0], x12\n\t"
"orr x19, x19, x20, lsl 32\n\t"
"mov v1.d[1], x14\n\t"
"orr x21, x21, x22, lsl 32\n\t"
"mov v2.d[0], x16\n\t"
"orr x23, x23, x24, lsl 32\n\t"
"mov v2.d[1], x19\n\t"
"mov v3.d[0], x21\n\t"
"mov v3.d[1], x23\n\t"
/* Add back state, XOR in message and store */
"add v0.4s, v0.4s, v16.4s\n\t"
"add v1.4s, v1.4s, v17.4s\n\t"
"add v2.4s, v2.4s, v18.4s\n\t"
"add v3.4s, v3.4s, v19.4s\n\t"
"eor v0.16b, v0.16b, v20.16b\n\t"
"eor v1.16b, v1.16b, v21.16b\n\t"
"eor v2.16b, v2.16b, v22.16b\n\t"
"eor v3.16b, v3.16b, v23.16b\n\t"
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[c]], #0x40\n\t"
"mov v19.d[0], x7\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
/* Transpose vectors */
"trn1 v20.4s, v0.4s, v1.4s\n\t"
"trn1 v22.4s, v2.4s, v3.4s\n\t"
"trn2 v21.4s, v0.4s, v1.4s\n\t"
"trn2 v23.4s, v2.4s, v3.4s\n\t"
"trn1 v0.2d, v20.2d, v22.2d\n\t"
"trn1 v1.2d, v21.2d, v23.2d\n\t"
"trn2 v2.2d, v20.2d, v22.2d\n\t"
"trn2 v3.2d, v21.2d, v23.2d\n\t"
"trn1 v20.4s, v4.4s, v5.4s\n\t"
"trn1 v22.4s, v6.4s, v7.4s\n\t"
"trn2 v21.4s, v4.4s, v5.4s\n\t"
"trn2 v23.4s, v6.4s, v7.4s\n\t"
"trn1 v4.2d, v20.2d, v22.2d\n\t"
"trn1 v5.2d, v21.2d, v23.2d\n\t"
"trn2 v6.2d, v20.2d, v22.2d\n\t"
"trn2 v7.2d, v21.2d, v23.2d\n\t"
"trn1 v20.4s, v8.4s, v9.4s\n\t"
"trn1 v22.4s, v10.4s, v11.4s\n\t"
"trn2 v21.4s, v8.4s, v9.4s\n\t"
"trn2 v23.4s, v10.4s, v11.4s\n\t"
"trn1 v8.2d, v20.2d, v22.2d\n\t"
"trn1 v9.2d, v21.2d, v23.2d\n\t"
"trn2 v10.2d, v20.2d, v22.2d\n\t"
"trn2 v11.2d, v21.2d, v23.2d\n\t"
"trn1 v20.4s, v12.4s, v13.4s\n\t"
"trn1 v22.4s, v14.4s, v15.4s\n\t"
"trn2 v21.4s, v12.4s, v13.4s\n\t"
"trn2 v23.4s, v14.4s, v15.4s\n\t"
"trn1 v12.2d, v20.2d, v22.2d\n\t"
"trn1 v13.2d, v21.2d, v23.2d\n\t"
"trn2 v14.2d, v20.2d, v22.2d\n\t"
"trn2 v15.2d, v21.2d, v23.2d\n\t"
/* Add back state, XOR in message and store (load next block) */
"add v20.4s, v0.4s, v16.4s\n\t"
"add v21.4s, v4.4s, v17.4s\n\t"
"add v22.4s, v8.4s, v18.4s\n\t"
"add v23.4s, v12.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
"add v20.4s, v1.4s, v16.4s\n\t"
"add v21.4s, v5.4s, v17.4s\n\t"
"add v22.4s, v9.4s, v18.4s\n\t"
"add v23.4s, v13.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
"add v20.4s, v2.4s, v16.4s\n\t"
"add v21.4s, v6.4s, v17.4s\n\t"
"add v22.4s, v10.4s, v18.4s\n\t"
"add v23.4s, v14.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
"add v20.4s, v3.4s, v16.4s\n\t"
"add v21.4s, v7.4s, v17.4s\n\t"
"add v22.4s, v11.4s, v18.4s\n\t"
"add v23.4s, v15.4s, v19.4s\n\t"
"eor v20.16b, v20.16b, v24.16b\n\t"
"eor v21.16b, v21.16b, v25.16b\n\t"
"eor v22.16b, v22.16b, v26.16b\n\t"
"eor v23.16b, v23.16b, v27.16b\n\t"
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
"mov v29.s[0], w26\n\t"
"sub %w[len], %w[len], #0x100\n\t"
"add v19.4s, v19.4s, v29.4s\n\t"
/* Done 256-byte block */
"\n"
"L_chacha_crypt_bytes_arm64_lt_256_%=: \n\t"

View File

@@ -1980,7 +1980,7 @@ _curve25519_base:
add x2, x2, :lo12:L_curve25519_base_x2
#else
adrp x2, L_curve25519_base_x2@PAGE
add x2, x2, :lo12:L_curve25519_base_x2@PAGEOFF
add x2, x2, L_curve25519_base_x2@PAGEOFF
#endif /* __APPLE__ */
ldp x6, x7, [x2]
ldp x8, x9, [x2, #16]