mirror of
https://github.com/wolfSSL/wolfssl.git
synced 2026-01-26 19:32:20 +01:00
Merge pull request #9627 from SparkiDev/aarch64_asm_chacha20_256
ChaCha20 Aarch64 ASM fix: 256-bit case fixed
This commit is contained in:
@@ -186,3 +186,184 @@ int test_wc_Chacha_Process(void)
|
||||
return EXPECT_RESULT();
|
||||
} /* END test_wc_Chacha_Process */
|
||||
|
||||
|
||||
#define CHACHA_LEN 1024
|
||||
/*
|
||||
* Testing wc_Chacha_Process()
|
||||
*/
|
||||
int test_wc_Chacha_Process_Chunking(void)
|
||||
{
|
||||
EXPECT_DECLS;
|
||||
#ifdef HAVE_CHACHA
|
||||
ChaCha enc;
|
||||
WC_DECLARE_VAR(plain, byte, CHACHA_LEN, NULL);
|
||||
WC_DECLARE_VAR(cipher, byte, CHACHA_LEN, NULL);
|
||||
byte key[CHACHA_MAX_KEY_SZ];
|
||||
byte iv[CHACHA_IV_BYTES];
|
||||
int i;
|
||||
int cnt;
|
||||
int sz;
|
||||
const byte expected[CHACHA_LEN] = {
|
||||
0xbc, 0xf5, 0x3b, 0xf2, 0x75, 0x85, 0x9e, 0x0a,
|
||||
0x09, 0x58, 0x83, 0x50, 0x33, 0x12, 0x01, 0xa1,
|
||||
0xb4, 0xaf, 0x8a, 0xe8, 0x4d, 0x3d, 0xa5, 0x68,
|
||||
0xf7, 0x6d, 0x3e, 0xe0, 0x62, 0x7e, 0x62, 0x66,
|
||||
0xdd, 0x07, 0xe9, 0x36, 0x6f, 0x4d, 0xe9, 0x7a,
|
||||
0x16, 0x48, 0xa1, 0x83, 0x9e, 0x67, 0x4d, 0xa3,
|
||||
0xfe, 0x7e, 0x4a, 0x31, 0xdd, 0xb6, 0x50, 0x39,
|
||||
0xd2, 0x2b, 0x93, 0xf2, 0x4d, 0x51, 0x44, 0x42,
|
||||
0x5d, 0xf1, 0xd9, 0x24, 0xd7, 0xef, 0x4b, 0xa4,
|
||||
0xfd, 0x6a, 0x53, 0xa5, 0x1e, 0x4a, 0xc8, 0x68,
|
||||
0x11, 0x69, 0xc6, 0xbd, 0xe1, 0x59, 0xe4, 0xca,
|
||||
0x5b, 0xa9, 0x77, 0xfe, 0x4f, 0x82, 0x9f, 0xcf,
|
||||
0x55, 0x16, 0x3c, 0xd5, 0x83, 0xee, 0xc7, 0x53,
|
||||
0xaf, 0xca, 0x8a, 0xe2, 0xcf, 0xf1, 0x4b, 0x3b,
|
||||
0x44, 0xf6, 0xc9, 0x6c, 0x5b, 0xd3, 0x28, 0x8a,
|
||||
0x7e, 0x67, 0xaa, 0x9e, 0xad, 0xce, 0x96, 0xc4,
|
||||
0x6e, 0x95, 0x8c, 0xf8, 0xf6, 0xb6, 0x42, 0x8e,
|
||||
0xe7, 0xab, 0xc8, 0x2c, 0x66, 0x8b, 0x80, 0xcf,
|
||||
0x78, 0xfe, 0x35, 0x8b, 0x59, 0x18, 0x45, 0xcb,
|
||||
0x18, 0xd4, 0x09, 0x88, 0xa9, 0xf9, 0x27, 0xd1,
|
||||
0x3b, 0x9d, 0x2b, 0xff, 0x89, 0x21, 0xb0, 0xd2,
|
||||
0xa7, 0x7e, 0x35, 0x61, 0xae, 0x1c, 0xc3, 0x1c,
|
||||
0x07, 0x5c, 0x10, 0x5d, 0x71, 0x3a, 0x3a, 0xe8,
|
||||
0x4c, 0xba, 0x00, 0xde, 0xd1, 0xf9, 0xa1, 0xae,
|
||||
0x7b, 0x91, 0x9d, 0x66, 0x31, 0x18, 0x55, 0x39,
|
||||
0xec, 0x1d, 0x83, 0x85, 0x1e, 0x5b, 0x35, 0x17,
|
||||
0x2e, 0xbc, 0x7a, 0x22, 0x79, 0x09, 0xa7, 0x02,
|
||||
0xf7, 0x3b, 0x93, 0x2c, 0x89, 0x1b, 0x69, 0xde,
|
||||
0x80, 0xc8, 0xdf, 0xce, 0xf9, 0xcd, 0xc8, 0x58,
|
||||
0xd6, 0x4b, 0x65, 0x9a, 0xc4, 0x4f, 0x27, 0xdb,
|
||||
0x9a, 0x6c, 0x3a, 0xef, 0x20, 0x0b, 0x00, 0x5c,
|
||||
0x9f, 0x91, 0xc1, 0xf6, 0x80, 0x53, 0x6c, 0x42,
|
||||
0xe3, 0xd0, 0xfb, 0x3b, 0x23, 0x75, 0x45, 0xa7,
|
||||
0x5b, 0x9b, 0xaa, 0xcd, 0x1e, 0x03, 0x35, 0x68,
|
||||
0x17, 0xee, 0xff, 0xd7, 0x4f, 0x77, 0x2f, 0xd0,
|
||||
0x1d, 0x5e, 0x89, 0x16, 0x50, 0x6f, 0x22, 0x44,
|
||||
0x10, 0x64, 0x37, 0x66, 0x70, 0x7f, 0x4d, 0x58,
|
||||
0x36, 0xec, 0x56, 0x4e, 0xfd, 0x22, 0x8d, 0x77,
|
||||
0xb1, 0x37, 0x07, 0x13, 0xdf, 0x34, 0x40, 0x1c,
|
||||
0x65, 0x95, 0x9b, 0xb9, 0xac, 0x11, 0xfe, 0x7a,
|
||||
0xae, 0x1f, 0x17, 0x94, 0xd4, 0xdd, 0x5b, 0x4f,
|
||||
0x69, 0xa8, 0x04, 0x8e, 0x80, 0x87, 0x7d, 0x96,
|
||||
0x25, 0x37, 0x83, 0x0e, 0xca, 0xa4, 0xb3, 0x29,
|
||||
0x2f, 0x4b, 0x83, 0xa4, 0x01, 0x36, 0x0d, 0xdb,
|
||||
0xd7, 0x6e, 0x7a, 0x9c, 0x3e, 0x82, 0xc8, 0x5f,
|
||||
0x4e, 0xc6, 0xd2, 0x97, 0x64, 0xe6, 0xd9, 0x50,
|
||||
0x89, 0xcb, 0x64, 0x33, 0x28, 0x9c, 0x14, 0xf9,
|
||||
0x41, 0x33, 0x99, 0x0c, 0x87, 0x6f, 0x00, 0x3f,
|
||||
0x00, 0x6f, 0xae, 0xe9, 0x20, 0xc2, 0xcd, 0xb8,
|
||||
0x7a, 0x58, 0xde, 0x57, 0x34, 0xda, 0x63, 0xa1,
|
||||
0x0b, 0x55, 0xfc, 0x54, 0x2a, 0xed, 0xc0, 0xbc,
|
||||
0x29, 0x5f, 0x88, 0x7d, 0x37, 0x3b, 0x48, 0x86,
|
||||
0x3f, 0x88, 0xa2, 0xef, 0x55, 0xe6, 0xc4, 0xf8,
|
||||
0xb8, 0x11, 0x9e, 0x3a, 0x45, 0x79, 0xac, 0x85,
|
||||
0xb2, 0x70, 0x40, 0xd0, 0x66, 0xe7, 0x66, 0xc8,
|
||||
0x8e, 0x8f, 0xde, 0xde, 0xf8, 0x50, 0x79, 0x9e,
|
||||
0x37, 0x04, 0x07, 0x83, 0x5b, 0xe0, 0x68, 0x5b,
|
||||
0x32, 0xbc, 0x6e, 0x50, 0x05, 0xca, 0xf8, 0x3b,
|
||||
0xec, 0x15, 0x13, 0xf8, 0x9a, 0xa2, 0x58, 0x98,
|
||||
0x03, 0x29, 0x83, 0x7f, 0x11, 0xb4, 0x98, 0x41,
|
||||
0xc1, 0xd9, 0x02, 0x6e, 0x2c, 0x45, 0x55, 0xab,
|
||||
0xff, 0xcf, 0x23, 0x80, 0xf0, 0x82, 0x73, 0xe9,
|
||||
0xe6, 0x8f, 0x1a, 0xd9, 0x70, 0xd6, 0x46, 0x1f,
|
||||
0xa8, 0xf8, 0xbd, 0x14, 0xd9, 0x50, 0x59, 0x8e,
|
||||
0x46, 0xbf, 0xe2, 0x8a, 0x8e, 0xce, 0xe7, 0x81,
|
||||
0xf4, 0x3a, 0xd9, 0x07, 0xd8, 0x1d, 0x29, 0x19,
|
||||
0xc1, 0x9d, 0xac, 0x6f, 0xfb, 0xce, 0x95, 0x03,
|
||||
0x29, 0xce, 0x4a, 0x60, 0x34, 0x6a, 0x88, 0xc7,
|
||||
0x5e, 0x8c, 0x71, 0x29, 0x81, 0x64, 0x2f, 0xfb,
|
||||
0xb4, 0x20, 0x08, 0x57, 0xba, 0x50, 0x75, 0x7b,
|
||||
0x1e, 0xfa, 0xcc, 0x60, 0xe7, 0x09, 0xab, 0x4e,
|
||||
0x46, 0x64, 0xfe, 0x17, 0x00, 0x84, 0x8b, 0xca,
|
||||
0xa8, 0xcb, 0x18, 0x5b, 0xa2, 0x04, 0x13, 0x68,
|
||||
0x99, 0x02, 0xaf, 0xcb, 0x75, 0xcb, 0x46, 0x61,
|
||||
0x66, 0x05, 0xd9, 0x5c, 0x6d, 0x8c, 0xf9, 0x8a,
|
||||
0x57, 0xde, 0xf4, 0xb9, 0x5d, 0x51, 0x17, 0x4a,
|
||||
0x8c, 0x42, 0xca, 0x0d, 0x7f, 0x92, 0x69, 0x0d,
|
||||
0x88, 0x2b, 0xc6, 0xee, 0xbd, 0x5a, 0x32, 0x17,
|
||||
0x84, 0xef, 0xf9, 0xd9, 0x51, 0x33, 0x57, 0x2f,
|
||||
0x87, 0xf8, 0xda, 0x3c, 0x3c, 0x14, 0xa9, 0x26,
|
||||
0xad, 0x19, 0xfd, 0x14, 0x5e, 0x33, 0x92, 0xb1,
|
||||
0xe1, 0xd7, 0xfb, 0x1e, 0x55, 0x40, 0xe5, 0x80,
|
||||
0x9b, 0x8e, 0x4b, 0x88, 0x58, 0x77, 0xa9, 0xd2,
|
||||
0xbf, 0x40, 0x90, 0xbe, 0x8f, 0x1f, 0xa7, 0x8a,
|
||||
0xaf, 0x8e, 0x03, 0x93, 0x4d, 0x8a, 0x73, 0x8e,
|
||||
0x76, 0x67, 0x43, 0x37, 0xc1, 0x76, 0x87, 0x50,
|
||||
0x37, 0xc4, 0x02, 0x4a, 0x53, 0x1a, 0x5b, 0xe8,
|
||||
0x5f, 0xc8, 0x28, 0xad, 0xd3, 0x8a, 0x97, 0x53,
|
||||
0xa3, 0xf6, 0x48, 0xba, 0x05, 0x18, 0x56, 0x90,
|
||||
0xa9, 0x95, 0xd8, 0xac, 0xe9, 0xd5, 0x6c, 0xe3,
|
||||
0x1f, 0xd8, 0xfc, 0xc5, 0x27, 0x19, 0xab, 0x4a,
|
||||
0xc4, 0x36, 0xc9, 0xe9, 0xaa, 0x30, 0xef, 0x8e,
|
||||
0x9e, 0x01, 0x18, 0x68, 0xe9, 0x06, 0xf8, 0x54,
|
||||
0xe5, 0xe2, 0xec, 0xde, 0x52, 0xfc, 0x3b, 0xdd,
|
||||
0xe9, 0xc7, 0xc8, 0x2b, 0x93, 0xd4, 0xdb, 0x28,
|
||||
0x72, 0x06, 0x07, 0xd1, 0xba, 0x05, 0x23, 0xa6,
|
||||
0x41, 0x42, 0x55, 0x6a, 0x6e, 0x6f, 0x6c, 0x40,
|
||||
0x6a, 0x19, 0xa4, 0xd5, 0xa2, 0x11, 0xb5, 0x2b,
|
||||
0x16, 0x4a, 0xe3, 0x41, 0xf3, 0xaf, 0x93, 0xbd,
|
||||
0xc8, 0xd9, 0x26, 0x43, 0x71, 0x56, 0xd2, 0x5e,
|
||||
0xf5, 0xa8, 0x3c, 0x64, 0x83, 0x04, 0x89, 0x62,
|
||||
0x20, 0xd3, 0xe9, 0x8e, 0x60, 0xcd, 0xec, 0xd9,
|
||||
0xce, 0x89, 0xf0, 0x5c, 0xf2, 0x26, 0x72, 0x51,
|
||||
0xd5, 0x16, 0x7b, 0xef, 0x19, 0x10, 0xb4, 0xce,
|
||||
0x60, 0x47, 0xab, 0x98, 0x86, 0xbd, 0x39, 0xb7,
|
||||
0xc9, 0x29, 0x38, 0x1a, 0xc1, 0x5c, 0xab, 0x77,
|
||||
0xea, 0xe9, 0xf4, 0x7f, 0x6a, 0x06, 0xf7, 0xc0,
|
||||
0x0b, 0x17, 0x1f, 0x2f, 0xce, 0x07, 0x1b, 0x33,
|
||||
0x68, 0x4d, 0x64, 0x6a, 0x28, 0x6d, 0x1d, 0xc6,
|
||||
0x54, 0x5c, 0xa2, 0x69, 0xf9, 0xb4, 0x62, 0xc9,
|
||||
0x71, 0xf5, 0xd1, 0xb7, 0x7b, 0x02, 0x81, 0x6d,
|
||||
0x4b, 0x1f, 0x62, 0xc5, 0xce, 0x2e, 0xc6, 0x2a,
|
||||
0x1d, 0x6f, 0xc7, 0xc1, 0x99, 0x48, 0x7b, 0xc7,
|
||||
0xf3, 0x53, 0xb7, 0x02, 0x7f, 0x82, 0xda, 0xfa,
|
||||
0xce, 0xd3, 0x54, 0xf8, 0x9b, 0x30, 0x6f, 0xed,
|
||||
0x6c, 0xec, 0x1c, 0x21, 0x49, 0x04, 0x51, 0xae,
|
||||
0xd0, 0x3f, 0xb1, 0xfb, 0x78, 0x1a, 0x6f, 0x35,
|
||||
0xc8, 0x3f, 0x4c, 0x43, 0x71, 0xe9, 0xb8, 0xd7,
|
||||
0x74, 0xca, 0x46, 0x68, 0xeb, 0xd9, 0xa3, 0x94,
|
||||
0x6e, 0x9d, 0xea, 0x57, 0x22, 0x1e, 0x15, 0x27,
|
||||
0x40, 0xd4, 0x0c, 0x32, 0x40, 0xc0, 0x40, 0x8a,
|
||||
0x1e, 0x2e, 0x1a, 0x58, 0x84, 0xa0, 0xc3, 0x68,
|
||||
0x96, 0xfe, 0xb0, 0x96, 0x6c, 0x04, 0x61, 0x35,
|
||||
0x4a, 0x78, 0xc5, 0xeb, 0x50, 0xca, 0xcb, 0x22,
|
||||
0x7b, 0x53, 0x02, 0xfa, 0x63, 0x28, 0x10, 0x68,
|
||||
0x77, 0xab, 0xda, 0x7d, 0xd1, 0xc2, 0x3f, 0x95,
|
||||
0xa6, 0x5a, 0x92, 0x56, 0xb3, 0xb0, 0x29, 0x7e,
|
||||
0x0c, 0xb3, 0xc9, 0x39, 0x0f, 0x1f, 0x51, 0x9d
|
||||
};
|
||||
|
||||
WC_ALLOC_VAR(plain, byte, CHACHA_LEN, NULL);
|
||||
WC_ALLOC_VAR(cipher, byte, CHACHA_LEN, NULL);
|
||||
|
||||
XMEMSET(plain, 0xa5, CHACHA_LEN);
|
||||
for (i = 0; i < (int)sizeof(key); i++) {
|
||||
key[i] = (byte)i;
|
||||
}
|
||||
for (i = 0; i < (int)sizeof(iv); i++) {
|
||||
iv[i] = (byte)(i + 0x40);
|
||||
}
|
||||
|
||||
for (sz = 1; sz < CHACHA_LEN; sz++) {
|
||||
ExpectIntEQ(wc_Chacha_SetKey(&enc, key, (word32)sizeof(key)), 0);
|
||||
ExpectIntEQ(wc_Chacha_SetIV(&enc, iv, 0), 0);
|
||||
|
||||
for (cnt = 0; cnt + sz <= CHACHA_LEN; cnt += sz) {
|
||||
ExpectIntEQ(wc_Chacha_Process(&enc, cipher + cnt, plain + cnt, sz),
|
||||
0);
|
||||
}
|
||||
if (cnt < CHACHA_LEN) {
|
||||
ExpectIntEQ(wc_Chacha_Process(&enc, cipher + cnt, plain + cnt,
|
||||
CHACHA_LEN - cnt), 0);
|
||||
}
|
||||
ExpectBufEQ(cipher, expected, (int)sizeof(expected));
|
||||
}
|
||||
|
||||
WC_FREE_VAR(plain, NULL);
|
||||
WC_FREE_VAR(cipher, NULL);
|
||||
#endif
|
||||
return EXPECT_RESULT();
|
||||
} /* END test_wc_Chacha_Process */
|
||||
|
||||
|
||||
|
||||
@@ -26,9 +26,11 @@
|
||||
|
||||
int test_wc_Chacha_SetKey(void);
|
||||
int test_wc_Chacha_Process(void);
|
||||
int test_wc_Chacha_Process_Chunking(void);
|
||||
|
||||
#define TEST_CHACHA_DECLS \
|
||||
TEST_DECL_GROUP("chacha", test_wc_Chacha_SetKey), \
|
||||
TEST_DECL_GROUP("chacha", test_wc_Chacha_Process)
|
||||
#define TEST_CHACHA_DECLS \
|
||||
TEST_DECL_GROUP("chacha", test_wc_Chacha_SetKey), \
|
||||
TEST_DECL_GROUP("chacha", test_wc_Chacha_Process), \
|
||||
TEST_DECL_GROUP("chacha", test_wc_Chacha_Process_Chunking)
|
||||
|
||||
#endif /* WOLFCRYPT_TEST_CHACHA_H */
|
||||
|
||||
@@ -493,321 +493,233 @@ L_chacha_crypt_bytes_arm64_round_start_320:
|
||||
L_chacha_crypt_bytes_arm64_lt_320:
|
||||
cmp x3, #0x100
|
||||
blt L_chacha_crypt_bytes_arm64_lt_256
|
||||
# Move state into regular register
|
||||
mov x8, v16.d[0]
|
||||
mov x10, v16.d[1]
|
||||
mov x12, v17.d[0]
|
||||
mov x14, v17.d[1]
|
||||
mov x16, v18.d[0]
|
||||
mov x19, v18.d[1]
|
||||
mov x21, v19.d[0]
|
||||
mov x23, v19.d[1]
|
||||
# Move state into vector registers
|
||||
mov v0.16b, v16.16b
|
||||
mov v1.16b, v17.16b
|
||||
lsr x9, x8, #32
|
||||
mov v2.16b, v18.16b
|
||||
add w5, w21, #1
|
||||
mov v3.16b, v19.16b
|
||||
lsr x11, x10, #32
|
||||
mov v4.16b, v16.16b
|
||||
mov v5.16b, v17.16b
|
||||
lsr x13, x12, #32
|
||||
mov v6.16b, v18.16b
|
||||
add w6, w21, #2
|
||||
mov v7.16b, v19.16b
|
||||
lsr x15, x14, #32
|
||||
mov v8.16b, v16.16b
|
||||
mov v9.16b, v17.16b
|
||||
lsr x17, x16, #32
|
||||
mov v10.16b, v18.16b
|
||||
add w21, w21, #3
|
||||
mov v11.16b, v19.16b
|
||||
lsr x20, x19, #32
|
||||
mov v7.s[0], w5
|
||||
lsr x22, x21, #32
|
||||
mov v11.s[0], w6
|
||||
lsr x24, x23, #32
|
||||
add w7, w21, #1
|
||||
dup v0.4s, v16.s[0]
|
||||
dup v1.4s, v16.s[1]
|
||||
dup v2.4s, v16.s[2]
|
||||
dup v3.4s, v16.s[3]
|
||||
dup v4.4s, v17.s[0]
|
||||
dup v5.4s, v17.s[1]
|
||||
dup v6.4s, v17.s[2]
|
||||
dup v7.4s, v17.s[3]
|
||||
dup v8.4s, v18.s[0]
|
||||
dup v9.4s, v18.s[1]
|
||||
dup v10.4s, v18.s[2]
|
||||
dup v11.4s, v18.s[3]
|
||||
dup v12.4s, v19.s[0]
|
||||
dup v13.4s, v19.s[1]
|
||||
dup v14.4s, v19.s[2]
|
||||
dup v15.4s, v19.s[3]
|
||||
# Add to counter word
|
||||
add v12.4s, v12.4s, v28.4s
|
||||
# Set number of odd+even rounds to perform
|
||||
mov x26, #10
|
||||
L_chacha_crypt_bytes_arm64_round_start_256:
|
||||
subs x26, x26, #1
|
||||
# Round odd
|
||||
# a += b; d ^= a; d <<<= 16;
|
||||
add v0.4s, v0.4s, v1.4s
|
||||
add w8, w8, w12
|
||||
add v4.4s, v4.4s, v5.4s
|
||||
add w9, w9, w13
|
||||
add v8.4s, v8.4s, v9.4s
|
||||
add w10, w10, w14
|
||||
eor v3.16b, v3.16b, v0.16b
|
||||
add w11, w11, w15
|
||||
eor v7.16b, v7.16b, v4.16b
|
||||
eor w21, w21, w8
|
||||
eor v11.16b, v11.16b, v8.16b
|
||||
eor w22, w22, w9
|
||||
rev32 v3.8h, v3.8h
|
||||
eor w23, w23, w10
|
||||
rev32 v7.8h, v7.8h
|
||||
eor w24, w24, w11
|
||||
rev32 v11.8h, v11.8h
|
||||
ror w21, w21, #16
|
||||
add v0.4s, v0.4s, v4.4s
|
||||
add v1.4s, v1.4s, v5.4s
|
||||
add v2.4s, v2.4s, v6.4s
|
||||
add v3.4s, v3.4s, v7.4s
|
||||
eor v12.16b, v12.16b, v0.16b
|
||||
eor v13.16b, v13.16b, v1.16b
|
||||
eor v14.16b, v14.16b, v2.16b
|
||||
eor v15.16b, v15.16b, v3.16b
|
||||
rev32 v12.8h, v12.8h
|
||||
rev32 v13.8h, v13.8h
|
||||
rev32 v14.8h, v14.8h
|
||||
rev32 v15.8h, v15.8h
|
||||
# c += d; b ^= c; b <<<= 12;
|
||||
add v2.4s, v2.4s, v3.4s
|
||||
ror w22, w22, #16
|
||||
add v6.4s, v6.4s, v7.4s
|
||||
ror w23, w23, #16
|
||||
add v10.4s, v10.4s, v11.4s
|
||||
ror w24, w24, #16
|
||||
eor v20.16b, v1.16b, v2.16b
|
||||
add w16, w16, w21
|
||||
eor v21.16b, v5.16b, v6.16b
|
||||
add w17, w17, w22
|
||||
eor v22.16b, v9.16b, v10.16b
|
||||
add w19, w19, w23
|
||||
shl v1.4s, v20.4s, #12
|
||||
add w20, w20, w24
|
||||
add v8.4s, v8.4s, v12.4s
|
||||
add v9.4s, v9.4s, v13.4s
|
||||
add v10.4s, v10.4s, v14.4s
|
||||
add v11.4s, v11.4s, v15.4s
|
||||
eor v20.16b, v4.16b, v8.16b
|
||||
eor v21.16b, v5.16b, v9.16b
|
||||
eor v22.16b, v6.16b, v10.16b
|
||||
eor v23.16b, v7.16b, v11.16b
|
||||
shl v4.4s, v20.4s, #12
|
||||
shl v5.4s, v21.4s, #12
|
||||
eor w12, w12, w16
|
||||
shl v9.4s, v22.4s, #12
|
||||
eor w13, w13, w17
|
||||
sri v1.4s, v20.4s, #20
|
||||
eor w14, w14, w19
|
||||
shl v6.4s, v22.4s, #12
|
||||
shl v7.4s, v23.4s, #12
|
||||
sri v4.4s, v20.4s, #20
|
||||
sri v5.4s, v21.4s, #20
|
||||
eor w15, w15, w20
|
||||
sri v9.4s, v22.4s, #20
|
||||
ror w12, w12, #20
|
||||
sri v6.4s, v22.4s, #20
|
||||
sri v7.4s, v23.4s, #20
|
||||
# a += b; d ^= a; d <<<= 8;
|
||||
add v0.4s, v0.4s, v1.4s
|
||||
ror w13, w13, #20
|
||||
add v4.4s, v4.4s, v5.4s
|
||||
ror w14, w14, #20
|
||||
add v8.4s, v8.4s, v9.4s
|
||||
ror w15, w15, #20
|
||||
eor v3.16b, v3.16b, v0.16b
|
||||
add w8, w8, w12
|
||||
eor v7.16b, v7.16b, v4.16b
|
||||
add w9, w9, w13
|
||||
eor v11.16b, v11.16b, v8.16b
|
||||
add w10, w10, w14
|
||||
tbl v3.16b, {v3.16b}, v30.16b
|
||||
add w11, w11, w15
|
||||
tbl v7.16b, {v7.16b}, v30.16b
|
||||
eor w21, w21, w8
|
||||
tbl v11.16b, {v11.16b}, v30.16b
|
||||
eor w22, w22, w9
|
||||
add v0.4s, v0.4s, v4.4s
|
||||
add v1.4s, v1.4s, v5.4s
|
||||
add v2.4s, v2.4s, v6.4s
|
||||
add v3.4s, v3.4s, v7.4s
|
||||
eor v12.16b, v12.16b, v0.16b
|
||||
eor v13.16b, v13.16b, v1.16b
|
||||
eor v14.16b, v14.16b, v2.16b
|
||||
eor v15.16b, v15.16b, v3.16b
|
||||
tbl v12.16b, {v12.16b}, v30.16b
|
||||
tbl v13.16b, {v13.16b}, v30.16b
|
||||
tbl v14.16b, {v14.16b}, v30.16b
|
||||
tbl v15.16b, {v15.16b}, v30.16b
|
||||
# c += d; b ^= c; b <<<= 7;
|
||||
add v2.4s, v2.4s, v3.4s
|
||||
eor w23, w23, w10
|
||||
add v6.4s, v6.4s, v7.4s
|
||||
eor w24, w24, w11
|
||||
add v10.4s, v10.4s, v11.4s
|
||||
ror w21, w21, #24
|
||||
eor v20.16b, v1.16b, v2.16b
|
||||
ror w22, w22, #24
|
||||
eor v21.16b, v5.16b, v6.16b
|
||||
ror w23, w23, #24
|
||||
eor v22.16b, v9.16b, v10.16b
|
||||
ror w24, w24, #24
|
||||
shl v1.4s, v20.4s, #7
|
||||
add w16, w16, w21
|
||||
add v8.4s, v8.4s, v12.4s
|
||||
add v9.4s, v9.4s, v13.4s
|
||||
add v10.4s, v10.4s, v14.4s
|
||||
add v11.4s, v11.4s, v15.4s
|
||||
eor v20.16b, v4.16b, v8.16b
|
||||
eor v21.16b, v5.16b, v9.16b
|
||||
eor v22.16b, v6.16b, v10.16b
|
||||
eor v23.16b, v7.16b, v11.16b
|
||||
shl v4.4s, v20.4s, #7
|
||||
shl v5.4s, v21.4s, #7
|
||||
add w17, w17, w22
|
||||
shl v9.4s, v22.4s, #7
|
||||
add w19, w19, w23
|
||||
sri v1.4s, v20.4s, #25
|
||||
add w20, w20, w24
|
||||
shl v6.4s, v22.4s, #7
|
||||
shl v7.4s, v23.4s, #7
|
||||
sri v4.4s, v20.4s, #25
|
||||
sri v5.4s, v21.4s, #25
|
||||
eor w12, w12, w16
|
||||
sri v9.4s, v22.4s, #25
|
||||
eor w13, w13, w17
|
||||
ext v3.16b, v3.16b, v3.16b, #12
|
||||
eor w14, w14, w19
|
||||
ext v7.16b, v7.16b, v7.16b, #12
|
||||
eor w15, w15, w20
|
||||
ext v11.16b, v11.16b, v11.16b, #12
|
||||
ror w12, w12, #25
|
||||
ext v1.16b, v1.16b, v1.16b, #4
|
||||
ror w13, w13, #25
|
||||
ext v5.16b, v5.16b, v5.16b, #4
|
||||
ror w14, w14, #25
|
||||
ext v9.16b, v9.16b, v9.16b, #4
|
||||
ror w15, w15, #25
|
||||
ext v2.16b, v2.16b, v2.16b, #8
|
||||
ext v6.16b, v6.16b, v6.16b, #8
|
||||
ext v10.16b, v10.16b, v10.16b, #8
|
||||
sri v6.4s, v22.4s, #25
|
||||
sri v7.4s, v23.4s, #25
|
||||
# Round even
|
||||
# a += b; d ^= a; d <<<= 16;
|
||||
add v0.4s, v0.4s, v1.4s
|
||||
add w8, w8, w13
|
||||
add v4.4s, v4.4s, v5.4s
|
||||
add w9, w9, w14
|
||||
add v8.4s, v8.4s, v9.4s
|
||||
add w10, w10, w15
|
||||
eor v3.16b, v3.16b, v0.16b
|
||||
add w11, w11, w12
|
||||
eor v7.16b, v7.16b, v4.16b
|
||||
eor w24, w24, w8
|
||||
eor v11.16b, v11.16b, v8.16b
|
||||
eor w21, w21, w9
|
||||
rev32 v3.8h, v3.8h
|
||||
eor w22, w22, w10
|
||||
rev32 v7.8h, v7.8h
|
||||
eor w23, w23, w11
|
||||
rev32 v11.8h, v11.8h
|
||||
ror w24, w24, #16
|
||||
add v0.4s, v0.4s, v5.4s
|
||||
add v1.4s, v1.4s, v6.4s
|
||||
add v2.4s, v2.4s, v7.4s
|
||||
add v3.4s, v3.4s, v4.4s
|
||||
eor v15.16b, v15.16b, v0.16b
|
||||
eor v12.16b, v12.16b, v1.16b
|
||||
eor v13.16b, v13.16b, v2.16b
|
||||
eor v14.16b, v14.16b, v3.16b
|
||||
rev32 v15.8h, v15.8h
|
||||
rev32 v12.8h, v12.8h
|
||||
rev32 v13.8h, v13.8h
|
||||
rev32 v14.8h, v14.8h
|
||||
# c += d; b ^= c; b <<<= 12;
|
||||
add v2.4s, v2.4s, v3.4s
|
||||
ror w21, w21, #16
|
||||
add v6.4s, v6.4s, v7.4s
|
||||
ror w22, w22, #16
|
||||
add v10.4s, v10.4s, v11.4s
|
||||
ror w23, w23, #16
|
||||
eor v20.16b, v1.16b, v2.16b
|
||||
add w19, w19, w24
|
||||
eor v21.16b, v5.16b, v6.16b
|
||||
add w20, w20, w21
|
||||
eor v22.16b, v9.16b, v10.16b
|
||||
add w16, w16, w22
|
||||
shl v1.4s, v20.4s, #12
|
||||
add w17, w17, w23
|
||||
shl v5.4s, v21.4s, #12
|
||||
eor w13, w13, w19
|
||||
shl v9.4s, v22.4s, #12
|
||||
eor w14, w14, w20
|
||||
sri v1.4s, v20.4s, #20
|
||||
eor w15, w15, w16
|
||||
sri v5.4s, v21.4s, #20
|
||||
eor w12, w12, w17
|
||||
sri v9.4s, v22.4s, #20
|
||||
ror w13, w13, #20
|
||||
add v10.4s, v10.4s, v15.4s
|
||||
add v11.4s, v11.4s, v12.4s
|
||||
add v8.4s, v8.4s, v13.4s
|
||||
add v9.4s, v9.4s, v14.4s
|
||||
eor v20.16b, v5.16b, v10.16b
|
||||
eor v21.16b, v6.16b, v11.16b
|
||||
eor v22.16b, v7.16b, v8.16b
|
||||
eor v23.16b, v4.16b, v9.16b
|
||||
shl v5.4s, v20.4s, #12
|
||||
shl v6.4s, v21.4s, #12
|
||||
shl v7.4s, v22.4s, #12
|
||||
shl v4.4s, v23.4s, #12
|
||||
sri v5.4s, v20.4s, #20
|
||||
sri v6.4s, v21.4s, #20
|
||||
sri v7.4s, v22.4s, #20
|
||||
sri v4.4s, v23.4s, #20
|
||||
# a += b; d ^= a; d <<<= 8;
|
||||
add v0.4s, v0.4s, v1.4s
|
||||
ror w14, w14, #20
|
||||
add v4.4s, v4.4s, v5.4s
|
||||
ror w15, w15, #20
|
||||
add v8.4s, v8.4s, v9.4s
|
||||
ror w12, w12, #20
|
||||
eor v3.16b, v3.16b, v0.16b
|
||||
add w8, w8, w13
|
||||
eor v7.16b, v7.16b, v4.16b
|
||||
add w9, w9, w14
|
||||
eor v11.16b, v11.16b, v8.16b
|
||||
add w10, w10, w15
|
||||
tbl v3.16b, {v3.16b}, v30.16b
|
||||
add w11, w11, w12
|
||||
tbl v7.16b, {v7.16b}, v30.16b
|
||||
eor w24, w24, w8
|
||||
tbl v11.16b, {v11.16b}, v30.16b
|
||||
eor w21, w21, w9
|
||||
add v0.4s, v0.4s, v5.4s
|
||||
add v1.4s, v1.4s, v6.4s
|
||||
add v2.4s, v2.4s, v7.4s
|
||||
add v3.4s, v3.4s, v4.4s
|
||||
eor v15.16b, v15.16b, v0.16b
|
||||
eor v12.16b, v12.16b, v1.16b
|
||||
eor v13.16b, v13.16b, v2.16b
|
||||
eor v14.16b, v14.16b, v3.16b
|
||||
tbl v15.16b, {v15.16b}, v30.16b
|
||||
tbl v12.16b, {v12.16b}, v30.16b
|
||||
tbl v13.16b, {v13.16b}, v30.16b
|
||||
tbl v14.16b, {v14.16b}, v30.16b
|
||||
# c += d; b ^= c; b <<<= 7;
|
||||
add v2.4s, v2.4s, v3.4s
|
||||
eor w22, w22, w10
|
||||
add v6.4s, v6.4s, v7.4s
|
||||
eor w23, w23, w11
|
||||
add v10.4s, v10.4s, v11.4s
|
||||
ror w24, w24, #24
|
||||
eor v20.16b, v1.16b, v2.16b
|
||||
ror w21, w21, #24
|
||||
eor v21.16b, v5.16b, v6.16b
|
||||
ror w22, w22, #24
|
||||
eor v22.16b, v9.16b, v10.16b
|
||||
ror w23, w23, #24
|
||||
shl v1.4s, v20.4s, #7
|
||||
add w19, w19, w24
|
||||
shl v5.4s, v21.4s, #7
|
||||
add w20, w20, w21
|
||||
shl v9.4s, v22.4s, #7
|
||||
add w16, w16, w22
|
||||
sri v1.4s, v20.4s, #25
|
||||
add w17, w17, w23
|
||||
sri v5.4s, v21.4s, #25
|
||||
eor w13, w13, w19
|
||||
sri v9.4s, v22.4s, #25
|
||||
eor w14, w14, w20
|
||||
ext v3.16b, v3.16b, v3.16b, #4
|
||||
eor w15, w15, w16
|
||||
ext v7.16b, v7.16b, v7.16b, #4
|
||||
eor w12, w12, w17
|
||||
ext v11.16b, v11.16b, v11.16b, #4
|
||||
ror w13, w13, #25
|
||||
ext v1.16b, v1.16b, v1.16b, #12
|
||||
ror w14, w14, #25
|
||||
ext v5.16b, v5.16b, v5.16b, #12
|
||||
ror w15, w15, #25
|
||||
ext v9.16b, v9.16b, v9.16b, #12
|
||||
ror w12, w12, #25
|
||||
ext v2.16b, v2.16b, v2.16b, #8
|
||||
ext v6.16b, v6.16b, v6.16b, #8
|
||||
ext v10.16b, v10.16b, v10.16b, #8
|
||||
add v10.4s, v10.4s, v15.4s
|
||||
add v11.4s, v11.4s, v12.4s
|
||||
add v8.4s, v8.4s, v13.4s
|
||||
add v9.4s, v9.4s, v14.4s
|
||||
eor v20.16b, v5.16b, v10.16b
|
||||
eor v21.16b, v6.16b, v11.16b
|
||||
eor v22.16b, v7.16b, v8.16b
|
||||
eor v23.16b, v4.16b, v9.16b
|
||||
shl v5.4s, v20.4s, #7
|
||||
shl v6.4s, v21.4s, #7
|
||||
shl v7.4s, v22.4s, #7
|
||||
shl v4.4s, v23.4s, #7
|
||||
sri v5.4s, v20.4s, #25
|
||||
sri v6.4s, v21.4s, #25
|
||||
sri v7.4s, v22.4s, #25
|
||||
sri v4.4s, v23.4s, #25
|
||||
bne L_chacha_crypt_bytes_arm64_round_start_256
|
||||
mov x26, #4
|
||||
# Add counter now rather than after transposed
|
||||
add v12.4s, v12.4s, v28.4s
|
||||
# Load message
|
||||
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
|
||||
# Add one (2 added during calculating vector results)
|
||||
add w21, w21, #1
|
||||
# Add back state, XOR msg, store (load next block)
|
||||
add v0.4s, v0.4s, v16.4s
|
||||
add v1.4s, v1.4s, v17.4s
|
||||
add v2.4s, v2.4s, v18.4s
|
||||
add v3.4s, v3.4s, v19.4s
|
||||
eor v0.16b, v0.16b, v20.16b
|
||||
eor v1.16b, v1.16b, v21.16b
|
||||
eor v2.16b, v2.16b, v22.16b
|
||||
eor v3.16b, v3.16b, v23.16b
|
||||
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
|
||||
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #0x40
|
||||
mov v19.s[0], w5
|
||||
add v4.4s, v4.4s, v16.4s
|
||||
add v5.4s, v5.4s, v17.4s
|
||||
add v6.4s, v6.4s, v18.4s
|
||||
add v7.4s, v7.4s, v19.4s
|
||||
eor v4.16b, v4.16b, v20.16b
|
||||
eor v5.16b, v5.16b, v21.16b
|
||||
eor v6.16b, v6.16b, v22.16b
|
||||
eor v7.16b, v7.16b, v23.16b
|
||||
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
|
||||
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #0x40
|
||||
mov v19.s[0], w6
|
||||
add v8.4s, v8.4s, v16.4s
|
||||
add v9.4s, v9.4s, v17.4s
|
||||
add v10.4s, v10.4s, v18.4s
|
||||
add v11.4s, v11.4s, v19.4s
|
||||
eor v8.16b, v8.16b, v20.16b
|
||||
eor v9.16b, v9.16b, v21.16b
|
||||
eor v10.16b, v10.16b, v22.16b
|
||||
eor v11.16b, v11.16b, v23.16b
|
||||
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
|
||||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x1], #0x40
|
||||
# Move regular registers into vector registers for adding and xor
|
||||
orr x8, x8, x9, lsl 32
|
||||
orr x10, x10, x11, lsl 32
|
||||
orr x12, x12, x13, lsl 32
|
||||
mov v0.d[0], x8
|
||||
orr x14, x14, x15, lsl 32
|
||||
mov v0.d[1], x10
|
||||
orr x16, x16, x17, lsl 32
|
||||
mov v1.d[0], x12
|
||||
orr x19, x19, x20, lsl 32
|
||||
mov v1.d[1], x14
|
||||
orr x21, x21, x22, lsl 32
|
||||
mov v2.d[0], x16
|
||||
orr x23, x23, x24, lsl 32
|
||||
mov v2.d[1], x19
|
||||
mov v3.d[0], x21
|
||||
mov v3.d[1], x23
|
||||
# Add back state, XOR in message and store
|
||||
add v0.4s, v0.4s, v16.4s
|
||||
add v1.4s, v1.4s, v17.4s
|
||||
add v2.4s, v2.4s, v18.4s
|
||||
add v3.4s, v3.4s, v19.4s
|
||||
eor v0.16b, v0.16b, v20.16b
|
||||
eor v1.16b, v1.16b, v21.16b
|
||||
eor v2.16b, v2.16b, v22.16b
|
||||
eor v3.16b, v3.16b, v23.16b
|
||||
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #0x40
|
||||
mov v19.d[0], x7
|
||||
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
|
||||
# Transpose vectors
|
||||
trn1 v20.4s, v0.4s, v1.4s
|
||||
trn1 v22.4s, v2.4s, v3.4s
|
||||
trn2 v21.4s, v0.4s, v1.4s
|
||||
trn2 v23.4s, v2.4s, v3.4s
|
||||
trn1 v0.2d, v20.2d, v22.2d
|
||||
trn1 v1.2d, v21.2d, v23.2d
|
||||
trn2 v2.2d, v20.2d, v22.2d
|
||||
trn2 v3.2d, v21.2d, v23.2d
|
||||
trn1 v20.4s, v4.4s, v5.4s
|
||||
trn1 v22.4s, v6.4s, v7.4s
|
||||
trn2 v21.4s, v4.4s, v5.4s
|
||||
trn2 v23.4s, v6.4s, v7.4s
|
||||
trn1 v4.2d, v20.2d, v22.2d
|
||||
trn1 v5.2d, v21.2d, v23.2d
|
||||
trn2 v6.2d, v20.2d, v22.2d
|
||||
trn2 v7.2d, v21.2d, v23.2d
|
||||
trn1 v20.4s, v8.4s, v9.4s
|
||||
trn1 v22.4s, v10.4s, v11.4s
|
||||
trn2 v21.4s, v8.4s, v9.4s
|
||||
trn2 v23.4s, v10.4s, v11.4s
|
||||
trn1 v8.2d, v20.2d, v22.2d
|
||||
trn1 v9.2d, v21.2d, v23.2d
|
||||
trn2 v10.2d, v20.2d, v22.2d
|
||||
trn2 v11.2d, v21.2d, v23.2d
|
||||
trn1 v20.4s, v12.4s, v13.4s
|
||||
trn1 v22.4s, v14.4s, v15.4s
|
||||
trn2 v21.4s, v12.4s, v13.4s
|
||||
trn2 v23.4s, v14.4s, v15.4s
|
||||
trn1 v12.2d, v20.2d, v22.2d
|
||||
trn1 v13.2d, v21.2d, v23.2d
|
||||
trn2 v14.2d, v20.2d, v22.2d
|
||||
trn2 v15.2d, v21.2d, v23.2d
|
||||
# Add back state, XOR in message and store (load next block)
|
||||
add v20.4s, v0.4s, v16.4s
|
||||
add v21.4s, v4.4s, v17.4s
|
||||
add v22.4s, v8.4s, v18.4s
|
||||
add v23.4s, v12.4s, v19.4s
|
||||
eor v20.16b, v20.16b, v24.16b
|
||||
eor v21.16b, v21.16b, v25.16b
|
||||
eor v22.16b, v22.16b, v26.16b
|
||||
eor v23.16b, v23.16b, v27.16b
|
||||
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
|
||||
st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
|
||||
add v20.4s, v1.4s, v16.4s
|
||||
add v21.4s, v5.4s, v17.4s
|
||||
add v22.4s, v9.4s, v18.4s
|
||||
add v23.4s, v13.4s, v19.4s
|
||||
eor v20.16b, v20.16b, v24.16b
|
||||
eor v21.16b, v21.16b, v25.16b
|
||||
eor v22.16b, v22.16b, v26.16b
|
||||
eor v23.16b, v23.16b, v27.16b
|
||||
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
|
||||
st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
|
||||
add v20.4s, v2.4s, v16.4s
|
||||
add v21.4s, v6.4s, v17.4s
|
||||
add v22.4s, v10.4s, v18.4s
|
||||
add v23.4s, v14.4s, v19.4s
|
||||
eor v20.16b, v20.16b, v24.16b
|
||||
eor v21.16b, v21.16b, v25.16b
|
||||
eor v22.16b, v22.16b, v26.16b
|
||||
eor v23.16b, v23.16b, v27.16b
|
||||
ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
|
||||
st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
|
||||
add v20.4s, v3.4s, v16.4s
|
||||
add v21.4s, v7.4s, v17.4s
|
||||
add v22.4s, v11.4s, v18.4s
|
||||
add v23.4s, v15.4s, v19.4s
|
||||
eor v20.16b, v20.16b, v24.16b
|
||||
eor v21.16b, v21.16b, v25.16b
|
||||
eor v22.16b, v22.16b, v26.16b
|
||||
eor v23.16b, v23.16b, v27.16b
|
||||
st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
|
||||
mov v29.s[0], w26
|
||||
sub x3, x3, #0x100
|
||||
add v19.4s, v19.4s, v29.4s
|
||||
# Done 256-byte block
|
||||
L_chacha_crypt_bytes_arm64_lt_256:
|
||||
cmp x3, #0x80
|
||||
|
||||
@@ -439,42 +439,25 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
|
||||
"L_chacha_crypt_bytes_arm64_lt_320_%=: \n\t"
|
||||
"cmp %w[len], #0x100\n\t"
|
||||
"b.lt L_chacha_crypt_bytes_arm64_lt_256_%=\n\t"
|
||||
/* Move state into regular register */
|
||||
"mov x8, v16.d[0]\n\t"
|
||||
"mov x10, v16.d[1]\n\t"
|
||||
"mov x12, v17.d[0]\n\t"
|
||||
"mov x14, v17.d[1]\n\t"
|
||||
"mov x16, v18.d[0]\n\t"
|
||||
"mov x19, v18.d[1]\n\t"
|
||||
"mov x21, v19.d[0]\n\t"
|
||||
"mov x23, v19.d[1]\n\t"
|
||||
/* Move state into vector registers */
|
||||
"mov v0.16b, v16.16b\n\t"
|
||||
"mov v1.16b, v17.16b\n\t"
|
||||
"lsr x9, x8, #32\n\t"
|
||||
"mov v2.16b, v18.16b\n\t"
|
||||
"add %w[rol8], w21, #1\n\t"
|
||||
"mov v3.16b, v19.16b\n\t"
|
||||
"lsr x11, x10, #32\n\t"
|
||||
"mov v4.16b, v16.16b\n\t"
|
||||
"mov v5.16b, v17.16b\n\t"
|
||||
"lsr x13, x12, #32\n\t"
|
||||
"mov v6.16b, v18.16b\n\t"
|
||||
"add %w[ctr], w21, #2\n\t"
|
||||
"mov v7.16b, v19.16b\n\t"
|
||||
"lsr x15, x14, #32\n\t"
|
||||
"mov v8.16b, v16.16b\n\t"
|
||||
"mov v9.16b, v17.16b\n\t"
|
||||
"lsr x17, x16, #32\n\t"
|
||||
"mov v10.16b, v18.16b\n\t"
|
||||
"add w21, w21, #3\n\t"
|
||||
"mov v11.16b, v19.16b\n\t"
|
||||
"lsr x20, x19, #32\n\t"
|
||||
"mov v7.s[0], %w[rol8]\n\t"
|
||||
"lsr x22, x21, #32\n\t"
|
||||
"mov v11.s[0], %w[ctr]\n\t"
|
||||
"lsr x24, x23, #32\n\t"
|
||||
"add w7, w21, #1\n\t"
|
||||
"dup v0.4s, v16.s[0]\n\t"
|
||||
"dup v1.4s, v16.s[1]\n\t"
|
||||
"dup v2.4s, v16.s[2]\n\t"
|
||||
"dup v3.4s, v16.s[3]\n\t"
|
||||
"dup v4.4s, v17.s[0]\n\t"
|
||||
"dup v5.4s, v17.s[1]\n\t"
|
||||
"dup v6.4s, v17.s[2]\n\t"
|
||||
"dup v7.4s, v17.s[3]\n\t"
|
||||
"dup v8.4s, v18.s[0]\n\t"
|
||||
"dup v9.4s, v18.s[1]\n\t"
|
||||
"dup v10.4s, v18.s[2]\n\t"
|
||||
"dup v11.4s, v18.s[3]\n\t"
|
||||
"dup v12.4s, v19.s[0]\n\t"
|
||||
"dup v13.4s, v19.s[1]\n\t"
|
||||
"dup v14.4s, v19.s[2]\n\t"
|
||||
"dup v15.4s, v19.s[3]\n\t"
|
||||
/* Add to counter word */
|
||||
"add v12.4s, v12.4s, v28.4s\n\t"
|
||||
/* Set number of odd+even rounds to perform */
|
||||
"mov x26, #10\n\t"
|
||||
"\n"
|
||||
@@ -482,279 +465,208 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
|
||||
"subs x26, x26, #1\n\t"
|
||||
/* Round odd */
|
||||
/* a += b; d ^= a; d <<<= 16; */
|
||||
"add v0.4s, v0.4s, v1.4s\n\t"
|
||||
"add w8, w8, w12\n\t"
|
||||
"add v4.4s, v4.4s, v5.4s\n\t"
|
||||
"add w9, w9, w13\n\t"
|
||||
"add v8.4s, v8.4s, v9.4s\n\t"
|
||||
"add w10, w10, w14\n\t"
|
||||
"eor v3.16b, v3.16b, v0.16b\n\t"
|
||||
"add w11, w11, w15\n\t"
|
||||
"eor v7.16b, v7.16b, v4.16b\n\t"
|
||||
"eor w21, w21, w8\n\t"
|
||||
"eor v11.16b, v11.16b, v8.16b\n\t"
|
||||
"eor w22, w22, w9\n\t"
|
||||
"rev32 v3.8h, v3.8h\n\t"
|
||||
"eor w23, w23, w10\n\t"
|
||||
"rev32 v7.8h, v7.8h\n\t"
|
||||
"eor w24, w24, w11\n\t"
|
||||
"rev32 v11.8h, v11.8h\n\t"
|
||||
"ror w21, w21, #16\n\t"
|
||||
"add v0.4s, v0.4s, v4.4s\n\t"
|
||||
"add v1.4s, v1.4s, v5.4s\n\t"
|
||||
"add v2.4s, v2.4s, v6.4s\n\t"
|
||||
"add v3.4s, v3.4s, v7.4s\n\t"
|
||||
"eor v12.16b, v12.16b, v0.16b\n\t"
|
||||
"eor v13.16b, v13.16b, v1.16b\n\t"
|
||||
"eor v14.16b, v14.16b, v2.16b\n\t"
|
||||
"eor v15.16b, v15.16b, v3.16b\n\t"
|
||||
"rev32 v12.8h, v12.8h\n\t"
|
||||
"rev32 v13.8h, v13.8h\n\t"
|
||||
"rev32 v14.8h, v14.8h\n\t"
|
||||
"rev32 v15.8h, v15.8h\n\t"
|
||||
/* c += d; b ^= c; b <<<= 12; */
|
||||
"add v2.4s, v2.4s, v3.4s\n\t"
|
||||
"ror w22, w22, #16\n\t"
|
||||
"add v6.4s, v6.4s, v7.4s\n\t"
|
||||
"ror w23, w23, #16\n\t"
|
||||
"add v10.4s, v10.4s, v11.4s\n\t"
|
||||
"ror w24, w24, #16\n\t"
|
||||
"eor v20.16b, v1.16b, v2.16b\n\t"
|
||||
"add w16, w16, w21\n\t"
|
||||
"eor v21.16b, v5.16b, v6.16b\n\t"
|
||||
"add w17, w17, w22\n\t"
|
||||
"eor v22.16b, v9.16b, v10.16b\n\t"
|
||||
"add w19, w19, w23\n\t"
|
||||
"shl v1.4s, v20.4s, #12\n\t"
|
||||
"add w20, w20, w24\n\t"
|
||||
"add v8.4s, v8.4s, v12.4s\n\t"
|
||||
"add v9.4s, v9.4s, v13.4s\n\t"
|
||||
"add v10.4s, v10.4s, v14.4s\n\t"
|
||||
"add v11.4s, v11.4s, v15.4s\n\t"
|
||||
"eor v20.16b, v4.16b, v8.16b\n\t"
|
||||
"eor v21.16b, v5.16b, v9.16b\n\t"
|
||||
"eor v22.16b, v6.16b, v10.16b\n\t"
|
||||
"eor v23.16b, v7.16b, v11.16b\n\t"
|
||||
"shl v4.4s, v20.4s, #12\n\t"
|
||||
"shl v5.4s, v21.4s, #12\n\t"
|
||||
"eor w12, w12, w16\n\t"
|
||||
"shl v9.4s, v22.4s, #12\n\t"
|
||||
"eor w13, w13, w17\n\t"
|
||||
"sri v1.4s, v20.4s, #20\n\t"
|
||||
"eor w14, w14, w19\n\t"
|
||||
"shl v6.4s, v22.4s, #12\n\t"
|
||||
"shl v7.4s, v23.4s, #12\n\t"
|
||||
"sri v4.4s, v20.4s, #20\n\t"
|
||||
"sri v5.4s, v21.4s, #20\n\t"
|
||||
"eor w15, w15, w20\n\t"
|
||||
"sri v9.4s, v22.4s, #20\n\t"
|
||||
"ror w12, w12, #20\n\t"
|
||||
"sri v6.4s, v22.4s, #20\n\t"
|
||||
"sri v7.4s, v23.4s, #20\n\t"
|
||||
/* a += b; d ^= a; d <<<= 8; */
|
||||
"add v0.4s, v0.4s, v1.4s\n\t"
|
||||
"ror w13, w13, #20\n\t"
|
||||
"add v4.4s, v4.4s, v5.4s\n\t"
|
||||
"ror w14, w14, #20\n\t"
|
||||
"add v8.4s, v8.4s, v9.4s\n\t"
|
||||
"ror w15, w15, #20\n\t"
|
||||
"eor v3.16b, v3.16b, v0.16b\n\t"
|
||||
"add w8, w8, w12\n\t"
|
||||
"eor v7.16b, v7.16b, v4.16b\n\t"
|
||||
"add w9, w9, w13\n\t"
|
||||
"eor v11.16b, v11.16b, v8.16b\n\t"
|
||||
"add w10, w10, w14\n\t"
|
||||
"tbl v3.16b, {v3.16b}, v30.16b\n\t"
|
||||
"add w11, w11, w15\n\t"
|
||||
"tbl v7.16b, {v7.16b}, v30.16b\n\t"
|
||||
"eor w21, w21, w8\n\t"
|
||||
"tbl v11.16b, {v11.16b}, v30.16b\n\t"
|
||||
"eor w22, w22, w9\n\t"
|
||||
"add v0.4s, v0.4s, v4.4s\n\t"
|
||||
"add v1.4s, v1.4s, v5.4s\n\t"
|
||||
"add v2.4s, v2.4s, v6.4s\n\t"
|
||||
"add v3.4s, v3.4s, v7.4s\n\t"
|
||||
"eor v12.16b, v12.16b, v0.16b\n\t"
|
||||
"eor v13.16b, v13.16b, v1.16b\n\t"
|
||||
"eor v14.16b, v14.16b, v2.16b\n\t"
|
||||
"eor v15.16b, v15.16b, v3.16b\n\t"
|
||||
"tbl v12.16b, {v12.16b}, v30.16b\n\t"
|
||||
"tbl v13.16b, {v13.16b}, v30.16b\n\t"
|
||||
"tbl v14.16b, {v14.16b}, v30.16b\n\t"
|
||||
"tbl v15.16b, {v15.16b}, v30.16b\n\t"
|
||||
/* c += d; b ^= c; b <<<= 7; */
|
||||
"add v2.4s, v2.4s, v3.4s\n\t"
|
||||
"eor w23, w23, w10\n\t"
|
||||
"add v6.4s, v6.4s, v7.4s\n\t"
|
||||
"eor w24, w24, w11\n\t"
|
||||
"add v10.4s, v10.4s, v11.4s\n\t"
|
||||
"ror w21, w21, #24\n\t"
|
||||
"eor v20.16b, v1.16b, v2.16b\n\t"
|
||||
"ror w22, w22, #24\n\t"
|
||||
"eor v21.16b, v5.16b, v6.16b\n\t"
|
||||
"ror w23, w23, #24\n\t"
|
||||
"eor v22.16b, v9.16b, v10.16b\n\t"
|
||||
"ror w24, w24, #24\n\t"
|
||||
"shl v1.4s, v20.4s, #7\n\t"
|
||||
"add w16, w16, w21\n\t"
|
||||
"add v8.4s, v8.4s, v12.4s\n\t"
|
||||
"add v9.4s, v9.4s, v13.4s\n\t"
|
||||
"add v10.4s, v10.4s, v14.4s\n\t"
|
||||
"add v11.4s, v11.4s, v15.4s\n\t"
|
||||
"eor v20.16b, v4.16b, v8.16b\n\t"
|
||||
"eor v21.16b, v5.16b, v9.16b\n\t"
|
||||
"eor v22.16b, v6.16b, v10.16b\n\t"
|
||||
"eor v23.16b, v7.16b, v11.16b\n\t"
|
||||
"shl v4.4s, v20.4s, #7\n\t"
|
||||
"shl v5.4s, v21.4s, #7\n\t"
|
||||
"add w17, w17, w22\n\t"
|
||||
"shl v9.4s, v22.4s, #7\n\t"
|
||||
"add w19, w19, w23\n\t"
|
||||
"sri v1.4s, v20.4s, #25\n\t"
|
||||
"add w20, w20, w24\n\t"
|
||||
"shl v6.4s, v22.4s, #7\n\t"
|
||||
"shl v7.4s, v23.4s, #7\n\t"
|
||||
"sri v4.4s, v20.4s, #25\n\t"
|
||||
"sri v5.4s, v21.4s, #25\n\t"
|
||||
"eor w12, w12, w16\n\t"
|
||||
"sri v9.4s, v22.4s, #25\n\t"
|
||||
"eor w13, w13, w17\n\t"
|
||||
"ext v3.16b, v3.16b, v3.16b, #12\n\t"
|
||||
"eor w14, w14, w19\n\t"
|
||||
"ext v7.16b, v7.16b, v7.16b, #12\n\t"
|
||||
"eor w15, w15, w20\n\t"
|
||||
"ext v11.16b, v11.16b, v11.16b, #12\n\t"
|
||||
"ror w12, w12, #25\n\t"
|
||||
"ext v1.16b, v1.16b, v1.16b, #4\n\t"
|
||||
"ror w13, w13, #25\n\t"
|
||||
"ext v5.16b, v5.16b, v5.16b, #4\n\t"
|
||||
"ror w14, w14, #25\n\t"
|
||||
"ext v9.16b, v9.16b, v9.16b, #4\n\t"
|
||||
"ror w15, w15, #25\n\t"
|
||||
"ext v2.16b, v2.16b, v2.16b, #8\n\t"
|
||||
"ext v6.16b, v6.16b, v6.16b, #8\n\t"
|
||||
"ext v10.16b, v10.16b, v10.16b, #8\n\t"
|
||||
"sri v6.4s, v22.4s, #25\n\t"
|
||||
"sri v7.4s, v23.4s, #25\n\t"
|
||||
/* Round even */
|
||||
/* a += b; d ^= a; d <<<= 16; */
|
||||
"add v0.4s, v0.4s, v1.4s\n\t"
|
||||
"add w8, w8, w13\n\t"
|
||||
"add v4.4s, v4.4s, v5.4s\n\t"
|
||||
"add w9, w9, w14\n\t"
|
||||
"add v8.4s, v8.4s, v9.4s\n\t"
|
||||
"add w10, w10, w15\n\t"
|
||||
"eor v3.16b, v3.16b, v0.16b\n\t"
|
||||
"add w11, w11, w12\n\t"
|
||||
"eor v7.16b, v7.16b, v4.16b\n\t"
|
||||
"eor w24, w24, w8\n\t"
|
||||
"eor v11.16b, v11.16b, v8.16b\n\t"
|
||||
"eor w21, w21, w9\n\t"
|
||||
"rev32 v3.8h, v3.8h\n\t"
|
||||
"eor w22, w22, w10\n\t"
|
||||
"rev32 v7.8h, v7.8h\n\t"
|
||||
"eor w23, w23, w11\n\t"
|
||||
"rev32 v11.8h, v11.8h\n\t"
|
||||
"ror w24, w24, #16\n\t"
|
||||
"add v0.4s, v0.4s, v5.4s\n\t"
|
||||
"add v1.4s, v1.4s, v6.4s\n\t"
|
||||
"add v2.4s, v2.4s, v7.4s\n\t"
|
||||
"add v3.4s, v3.4s, v4.4s\n\t"
|
||||
"eor v15.16b, v15.16b, v0.16b\n\t"
|
||||
"eor v12.16b, v12.16b, v1.16b\n\t"
|
||||
"eor v13.16b, v13.16b, v2.16b\n\t"
|
||||
"eor v14.16b, v14.16b, v3.16b\n\t"
|
||||
"rev32 v15.8h, v15.8h\n\t"
|
||||
"rev32 v12.8h, v12.8h\n\t"
|
||||
"rev32 v13.8h, v13.8h\n\t"
|
||||
"rev32 v14.8h, v14.8h\n\t"
|
||||
/* c += d; b ^= c; b <<<= 12; */
|
||||
"add v2.4s, v2.4s, v3.4s\n\t"
|
||||
"ror w21, w21, #16\n\t"
|
||||
"add v6.4s, v6.4s, v7.4s\n\t"
|
||||
"ror w22, w22, #16\n\t"
|
||||
"add v10.4s, v10.4s, v11.4s\n\t"
|
||||
"ror w23, w23, #16\n\t"
|
||||
"eor v20.16b, v1.16b, v2.16b\n\t"
|
||||
"add w19, w19, w24\n\t"
|
||||
"eor v21.16b, v5.16b, v6.16b\n\t"
|
||||
"add w20, w20, w21\n\t"
|
||||
"eor v22.16b, v9.16b, v10.16b\n\t"
|
||||
"add w16, w16, w22\n\t"
|
||||
"shl v1.4s, v20.4s, #12\n\t"
|
||||
"add w17, w17, w23\n\t"
|
||||
"shl v5.4s, v21.4s, #12\n\t"
|
||||
"eor w13, w13, w19\n\t"
|
||||
"shl v9.4s, v22.4s, #12\n\t"
|
||||
"eor w14, w14, w20\n\t"
|
||||
"sri v1.4s, v20.4s, #20\n\t"
|
||||
"eor w15, w15, w16\n\t"
|
||||
"sri v5.4s, v21.4s, #20\n\t"
|
||||
"eor w12, w12, w17\n\t"
|
||||
"sri v9.4s, v22.4s, #20\n\t"
|
||||
"ror w13, w13, #20\n\t"
|
||||
"add v10.4s, v10.4s, v15.4s\n\t"
|
||||
"add v11.4s, v11.4s, v12.4s\n\t"
|
||||
"add v8.4s, v8.4s, v13.4s\n\t"
|
||||
"add v9.4s, v9.4s, v14.4s\n\t"
|
||||
"eor v20.16b, v5.16b, v10.16b\n\t"
|
||||
"eor v21.16b, v6.16b, v11.16b\n\t"
|
||||
"eor v22.16b, v7.16b, v8.16b\n\t"
|
||||
"eor v23.16b, v4.16b, v9.16b\n\t"
|
||||
"shl v5.4s, v20.4s, #12\n\t"
|
||||
"shl v6.4s, v21.4s, #12\n\t"
|
||||
"shl v7.4s, v22.4s, #12\n\t"
|
||||
"shl v4.4s, v23.4s, #12\n\t"
|
||||
"sri v5.4s, v20.4s, #20\n\t"
|
||||
"sri v6.4s, v21.4s, #20\n\t"
|
||||
"sri v7.4s, v22.4s, #20\n\t"
|
||||
"sri v4.4s, v23.4s, #20\n\t"
|
||||
/* a += b; d ^= a; d <<<= 8; */
|
||||
"add v0.4s, v0.4s, v1.4s\n\t"
|
||||
"ror w14, w14, #20\n\t"
|
||||
"add v4.4s, v4.4s, v5.4s\n\t"
|
||||
"ror w15, w15, #20\n\t"
|
||||
"add v8.4s, v8.4s, v9.4s\n\t"
|
||||
"ror w12, w12, #20\n\t"
|
||||
"eor v3.16b, v3.16b, v0.16b\n\t"
|
||||
"add w8, w8, w13\n\t"
|
||||
"eor v7.16b, v7.16b, v4.16b\n\t"
|
||||
"add w9, w9, w14\n\t"
|
||||
"eor v11.16b, v11.16b, v8.16b\n\t"
|
||||
"add w10, w10, w15\n\t"
|
||||
"tbl v3.16b, {v3.16b}, v30.16b\n\t"
|
||||
"add w11, w11, w12\n\t"
|
||||
"tbl v7.16b, {v7.16b}, v30.16b\n\t"
|
||||
"eor w24, w24, w8\n\t"
|
||||
"tbl v11.16b, {v11.16b}, v30.16b\n\t"
|
||||
"eor w21, w21, w9\n\t"
|
||||
"add v0.4s, v0.4s, v5.4s\n\t"
|
||||
"add v1.4s, v1.4s, v6.4s\n\t"
|
||||
"add v2.4s, v2.4s, v7.4s\n\t"
|
||||
"add v3.4s, v3.4s, v4.4s\n\t"
|
||||
"eor v15.16b, v15.16b, v0.16b\n\t"
|
||||
"eor v12.16b, v12.16b, v1.16b\n\t"
|
||||
"eor v13.16b, v13.16b, v2.16b\n\t"
|
||||
"eor v14.16b, v14.16b, v3.16b\n\t"
|
||||
"tbl v15.16b, {v15.16b}, v30.16b\n\t"
|
||||
"tbl v12.16b, {v12.16b}, v30.16b\n\t"
|
||||
"tbl v13.16b, {v13.16b}, v30.16b\n\t"
|
||||
"tbl v14.16b, {v14.16b}, v30.16b\n\t"
|
||||
/* c += d; b ^= c; b <<<= 7; */
|
||||
"add v2.4s, v2.4s, v3.4s\n\t"
|
||||
"eor w22, w22, w10\n\t"
|
||||
"add v6.4s, v6.4s, v7.4s\n\t"
|
||||
"eor w23, w23, w11\n\t"
|
||||
"add v10.4s, v10.4s, v11.4s\n\t"
|
||||
"ror w24, w24, #24\n\t"
|
||||
"eor v20.16b, v1.16b, v2.16b\n\t"
|
||||
"ror w21, w21, #24\n\t"
|
||||
"eor v21.16b, v5.16b, v6.16b\n\t"
|
||||
"ror w22, w22, #24\n\t"
|
||||
"eor v22.16b, v9.16b, v10.16b\n\t"
|
||||
"ror w23, w23, #24\n\t"
|
||||
"shl v1.4s, v20.4s, #7\n\t"
|
||||
"add w19, w19, w24\n\t"
|
||||
"shl v5.4s, v21.4s, #7\n\t"
|
||||
"add w20, w20, w21\n\t"
|
||||
"shl v9.4s, v22.4s, #7\n\t"
|
||||
"add w16, w16, w22\n\t"
|
||||
"sri v1.4s, v20.4s, #25\n\t"
|
||||
"add w17, w17, w23\n\t"
|
||||
"sri v5.4s, v21.4s, #25\n\t"
|
||||
"eor w13, w13, w19\n\t"
|
||||
"sri v9.4s, v22.4s, #25\n\t"
|
||||
"eor w14, w14, w20\n\t"
|
||||
"ext v3.16b, v3.16b, v3.16b, #4\n\t"
|
||||
"eor w15, w15, w16\n\t"
|
||||
"ext v7.16b, v7.16b, v7.16b, #4\n\t"
|
||||
"eor w12, w12, w17\n\t"
|
||||
"ext v11.16b, v11.16b, v11.16b, #4\n\t"
|
||||
"ror w13, w13, #25\n\t"
|
||||
"ext v1.16b, v1.16b, v1.16b, #12\n\t"
|
||||
"ror w14, w14, #25\n\t"
|
||||
"ext v5.16b, v5.16b, v5.16b, #12\n\t"
|
||||
"ror w15, w15, #25\n\t"
|
||||
"ext v9.16b, v9.16b, v9.16b, #12\n\t"
|
||||
"ror w12, w12, #25\n\t"
|
||||
"ext v2.16b, v2.16b, v2.16b, #8\n\t"
|
||||
"ext v6.16b, v6.16b, v6.16b, #8\n\t"
|
||||
"ext v10.16b, v10.16b, v10.16b, #8\n\t"
|
||||
"add v10.4s, v10.4s, v15.4s\n\t"
|
||||
"add v11.4s, v11.4s, v12.4s\n\t"
|
||||
"add v8.4s, v8.4s, v13.4s\n\t"
|
||||
"add v9.4s, v9.4s, v14.4s\n\t"
|
||||
"eor v20.16b, v5.16b, v10.16b\n\t"
|
||||
"eor v21.16b, v6.16b, v11.16b\n\t"
|
||||
"eor v22.16b, v7.16b, v8.16b\n\t"
|
||||
"eor v23.16b, v4.16b, v9.16b\n\t"
|
||||
"shl v5.4s, v20.4s, #7\n\t"
|
||||
"shl v6.4s, v21.4s, #7\n\t"
|
||||
"shl v7.4s, v22.4s, #7\n\t"
|
||||
"shl v4.4s, v23.4s, #7\n\t"
|
||||
"sri v5.4s, v20.4s, #25\n\t"
|
||||
"sri v6.4s, v21.4s, #25\n\t"
|
||||
"sri v7.4s, v22.4s, #25\n\t"
|
||||
"sri v4.4s, v23.4s, #25\n\t"
|
||||
"b.ne L_chacha_crypt_bytes_arm64_round_start_256_%=\n\t"
|
||||
"mov x26, #4\n\t"
|
||||
/* Add counter now rather than after transposed */
|
||||
"add v12.4s, v12.4s, v28.4s\n\t"
|
||||
/* Load message */
|
||||
"ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
|
||||
/* Add one (2 added during calculating vector results) */
|
||||
"add w21, w21, #1\n\t"
|
||||
/* Add back state, XOR msg, store (load next block) */
|
||||
"add v0.4s, v0.4s, v16.4s\n\t"
|
||||
"add v1.4s, v1.4s, v17.4s\n\t"
|
||||
"add v2.4s, v2.4s, v18.4s\n\t"
|
||||
"add v3.4s, v3.4s, v19.4s\n\t"
|
||||
"eor v0.16b, v0.16b, v20.16b\n\t"
|
||||
"eor v1.16b, v1.16b, v21.16b\n\t"
|
||||
"eor v2.16b, v2.16b, v22.16b\n\t"
|
||||
"eor v3.16b, v3.16b, v23.16b\n\t"
|
||||
"ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
|
||||
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[c]], #0x40\n\t"
|
||||
"mov v19.s[0], %w[rol8]\n\t"
|
||||
"add v4.4s, v4.4s, v16.4s\n\t"
|
||||
"add v5.4s, v5.4s, v17.4s\n\t"
|
||||
"add v6.4s, v6.4s, v18.4s\n\t"
|
||||
"add v7.4s, v7.4s, v19.4s\n\t"
|
||||
"eor v4.16b, v4.16b, v20.16b\n\t"
|
||||
"eor v5.16b, v5.16b, v21.16b\n\t"
|
||||
"eor v6.16b, v6.16b, v22.16b\n\t"
|
||||
"eor v7.16b, v7.16b, v23.16b\n\t"
|
||||
"ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
|
||||
"st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%x[c]], #0x40\n\t"
|
||||
"mov v19.s[0], %w[ctr]\n\t"
|
||||
"add v8.4s, v8.4s, v16.4s\n\t"
|
||||
"add v9.4s, v9.4s, v17.4s\n\t"
|
||||
"add v10.4s, v10.4s, v18.4s\n\t"
|
||||
"add v11.4s, v11.4s, v19.4s\n\t"
|
||||
"eor v8.16b, v8.16b, v20.16b\n\t"
|
||||
"eor v9.16b, v9.16b, v21.16b\n\t"
|
||||
"eor v10.16b, v10.16b, v22.16b\n\t"
|
||||
"eor v11.16b, v11.16b, v23.16b\n\t"
|
||||
"ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
|
||||
"st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%x[c]], #0x40\n\t"
|
||||
/* Move regular registers into vector registers for adding and xor */
|
||||
"orr x8, x8, x9, lsl 32\n\t"
|
||||
"orr x10, x10, x11, lsl 32\n\t"
|
||||
"orr x12, x12, x13, lsl 32\n\t"
|
||||
"mov v0.d[0], x8\n\t"
|
||||
"orr x14, x14, x15, lsl 32\n\t"
|
||||
"mov v0.d[1], x10\n\t"
|
||||
"orr x16, x16, x17, lsl 32\n\t"
|
||||
"mov v1.d[0], x12\n\t"
|
||||
"orr x19, x19, x20, lsl 32\n\t"
|
||||
"mov v1.d[1], x14\n\t"
|
||||
"orr x21, x21, x22, lsl 32\n\t"
|
||||
"mov v2.d[0], x16\n\t"
|
||||
"orr x23, x23, x24, lsl 32\n\t"
|
||||
"mov v2.d[1], x19\n\t"
|
||||
"mov v3.d[0], x21\n\t"
|
||||
"mov v3.d[1], x23\n\t"
|
||||
/* Add back state, XOR in message and store */
|
||||
"add v0.4s, v0.4s, v16.4s\n\t"
|
||||
"add v1.4s, v1.4s, v17.4s\n\t"
|
||||
"add v2.4s, v2.4s, v18.4s\n\t"
|
||||
"add v3.4s, v3.4s, v19.4s\n\t"
|
||||
"eor v0.16b, v0.16b, v20.16b\n\t"
|
||||
"eor v1.16b, v1.16b, v21.16b\n\t"
|
||||
"eor v2.16b, v2.16b, v22.16b\n\t"
|
||||
"eor v3.16b, v3.16b, v23.16b\n\t"
|
||||
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[c]], #0x40\n\t"
|
||||
"mov v19.d[0], x7\n\t"
|
||||
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
|
||||
/* Transpose vectors */
|
||||
"trn1 v20.4s, v0.4s, v1.4s\n\t"
|
||||
"trn1 v22.4s, v2.4s, v3.4s\n\t"
|
||||
"trn2 v21.4s, v0.4s, v1.4s\n\t"
|
||||
"trn2 v23.4s, v2.4s, v3.4s\n\t"
|
||||
"trn1 v0.2d, v20.2d, v22.2d\n\t"
|
||||
"trn1 v1.2d, v21.2d, v23.2d\n\t"
|
||||
"trn2 v2.2d, v20.2d, v22.2d\n\t"
|
||||
"trn2 v3.2d, v21.2d, v23.2d\n\t"
|
||||
"trn1 v20.4s, v4.4s, v5.4s\n\t"
|
||||
"trn1 v22.4s, v6.4s, v7.4s\n\t"
|
||||
"trn2 v21.4s, v4.4s, v5.4s\n\t"
|
||||
"trn2 v23.4s, v6.4s, v7.4s\n\t"
|
||||
"trn1 v4.2d, v20.2d, v22.2d\n\t"
|
||||
"trn1 v5.2d, v21.2d, v23.2d\n\t"
|
||||
"trn2 v6.2d, v20.2d, v22.2d\n\t"
|
||||
"trn2 v7.2d, v21.2d, v23.2d\n\t"
|
||||
"trn1 v20.4s, v8.4s, v9.4s\n\t"
|
||||
"trn1 v22.4s, v10.4s, v11.4s\n\t"
|
||||
"trn2 v21.4s, v8.4s, v9.4s\n\t"
|
||||
"trn2 v23.4s, v10.4s, v11.4s\n\t"
|
||||
"trn1 v8.2d, v20.2d, v22.2d\n\t"
|
||||
"trn1 v9.2d, v21.2d, v23.2d\n\t"
|
||||
"trn2 v10.2d, v20.2d, v22.2d\n\t"
|
||||
"trn2 v11.2d, v21.2d, v23.2d\n\t"
|
||||
"trn1 v20.4s, v12.4s, v13.4s\n\t"
|
||||
"trn1 v22.4s, v14.4s, v15.4s\n\t"
|
||||
"trn2 v21.4s, v12.4s, v13.4s\n\t"
|
||||
"trn2 v23.4s, v14.4s, v15.4s\n\t"
|
||||
"trn1 v12.2d, v20.2d, v22.2d\n\t"
|
||||
"trn1 v13.2d, v21.2d, v23.2d\n\t"
|
||||
"trn2 v14.2d, v20.2d, v22.2d\n\t"
|
||||
"trn2 v15.2d, v21.2d, v23.2d\n\t"
|
||||
/* Add back state, XOR in message and store (load next block) */
|
||||
"add v20.4s, v0.4s, v16.4s\n\t"
|
||||
"add v21.4s, v4.4s, v17.4s\n\t"
|
||||
"add v22.4s, v8.4s, v18.4s\n\t"
|
||||
"add v23.4s, v12.4s, v19.4s\n\t"
|
||||
"eor v20.16b, v20.16b, v24.16b\n\t"
|
||||
"eor v21.16b, v21.16b, v25.16b\n\t"
|
||||
"eor v22.16b, v22.16b, v26.16b\n\t"
|
||||
"eor v23.16b, v23.16b, v27.16b\n\t"
|
||||
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
|
||||
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
|
||||
"add v20.4s, v1.4s, v16.4s\n\t"
|
||||
"add v21.4s, v5.4s, v17.4s\n\t"
|
||||
"add v22.4s, v9.4s, v18.4s\n\t"
|
||||
"add v23.4s, v13.4s, v19.4s\n\t"
|
||||
"eor v20.16b, v20.16b, v24.16b\n\t"
|
||||
"eor v21.16b, v21.16b, v25.16b\n\t"
|
||||
"eor v22.16b, v22.16b, v26.16b\n\t"
|
||||
"eor v23.16b, v23.16b, v27.16b\n\t"
|
||||
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
|
||||
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
|
||||
"add v20.4s, v2.4s, v16.4s\n\t"
|
||||
"add v21.4s, v6.4s, v17.4s\n\t"
|
||||
"add v22.4s, v10.4s, v18.4s\n\t"
|
||||
"add v23.4s, v14.4s, v19.4s\n\t"
|
||||
"eor v20.16b, v20.16b, v24.16b\n\t"
|
||||
"eor v21.16b, v21.16b, v25.16b\n\t"
|
||||
"eor v22.16b, v22.16b, v26.16b\n\t"
|
||||
"eor v23.16b, v23.16b, v27.16b\n\t"
|
||||
"ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
|
||||
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
|
||||
"add v20.4s, v3.4s, v16.4s\n\t"
|
||||
"add v21.4s, v7.4s, v17.4s\n\t"
|
||||
"add v22.4s, v11.4s, v18.4s\n\t"
|
||||
"add v23.4s, v15.4s, v19.4s\n\t"
|
||||
"eor v20.16b, v20.16b, v24.16b\n\t"
|
||||
"eor v21.16b, v21.16b, v25.16b\n\t"
|
||||
"eor v22.16b, v22.16b, v26.16b\n\t"
|
||||
"eor v23.16b, v23.16b, v27.16b\n\t"
|
||||
"st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
|
||||
"mov v29.s[0], w26\n\t"
|
||||
"sub %w[len], %w[len], #0x100\n\t"
|
||||
"add v19.4s, v19.4s, v29.4s\n\t"
|
||||
/* Done 256-byte block */
|
||||
"\n"
|
||||
"L_chacha_crypt_bytes_arm64_lt_256_%=: \n\t"
|
||||
|
||||
@@ -1980,7 +1980,7 @@ _curve25519_base:
|
||||
add x2, x2, :lo12:L_curve25519_base_x2
|
||||
#else
|
||||
adrp x2, L_curve25519_base_x2@PAGE
|
||||
add x2, x2, :lo12:L_curve25519_base_x2@PAGEOFF
|
||||
add x2, x2, L_curve25519_base_x2@PAGEOFF
|
||||
#endif /* __APPLE__ */
|
||||
ldp x6, x7, [x2]
|
||||
ldp x8, x9, [x2, #16]
|
||||
|
||||
Reference in New Issue
Block a user