From 80b7ea638ef9920fd40e5aa333d8569c05075d6b Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Fri, 5 Dec 2025 10:06:23 +1000 Subject: [PATCH] Aarch64 no hardware crypto assembly AES Implementations of AES-ECB, AES-CBC, AES-CTR, AES-GCM, AES-XTS with base instructions and NEON but not using crypto instructions. Benchmark of AES-ECB added. Updated AES tests. --- .wolfssl_known_macro_extras | 1 + tests/api/test_aes.c | 309 +- tests/api/test_aes.h | 8 +- wolfcrypt/benchmark/benchmark.c | 8 +- wolfcrypt/src/aes.c | 823 +- wolfcrypt/src/port/arm/armv8-aes-asm.S | 14146 +++++++++++++++++++++ wolfcrypt/src/port/arm/armv8-aes-asm_c.c | 13281 +++++++++++++++++++ wolfssl/wolfcrypt/aes.h | 123 +- 8 files changed, 28425 insertions(+), 274 deletions(-) diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras index ca9dc8cce..52270c6eb 100644 --- a/.wolfssl_known_macro_extras +++ b/.wolfssl_known_macro_extras @@ -663,6 +663,7 @@ WOLFSSL_ALLOW_TLS_SHA1 WOLFSSL_ALTERNATIVE_DOWNGRADE WOLFSSL_ALT_NAMES_NO_REV WOLFSSL_ARM_ARCH_NEON_64BIT +WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP WOLFSSL_ASCON_UNROLL WOLFSSL_ASNC_CRYPT WOLFSSL_ASN_EXTRA diff --git a/tests/api/test_aes.c b/tests/api/test_aes.c index 97186ecfc..fd56d2658 100644 --- a/tests/api/test_aes.c +++ b/tests/api/test_aes.c @@ -289,7 +289,7 @@ int test_wc_AesEncryptDecryptDirect(void) #if !defined(NO_AES) && defined(HAVE_AES_ECB) /* Assembly code doing 8 iterations at a time. */ -#define ECB_LEN (9 * WC_AES_BLOCK_SIZE) +#define ECB_LEN (15 * WC_AES_BLOCK_SIZE) static int test_wc_AesEcbEncryptDecrypt_BadArgs(Aes* aes, byte* key, word32 keyLen) @@ -1993,7 +1993,7 @@ int test_wc_AesCtrSetKey(void) #if !defined(NO_AES) && defined(WOLFSSL_AES_COUNTER) /* Assembly code doing 8 iterations at a time.
*/ -#define CTR_LEN (9 * WC_AES_BLOCK_SIZE) +#define CTR_LEN (15 * WC_AES_BLOCK_SIZE) static int test_wc_AesCtrEncrypt_BadArgs(Aes* aes, byte* key, word32 keyLen, byte* iv) @@ -2237,6 +2237,18 @@ int test_wc_AesCtrEncryptDecrypt(void) 0x86, 0x8f, 0x83, 0xff, 0x3d, 0xbe, 0x6e, 0xfa, 0xd2, 0x2b, 0x3e, 0x70, 0x21, 0x1c, 0xe8, 0x7b, 0xe4, 0x01, 0x2c, 0xd0, 0x82, 0xe2, 0x7a, 0x4a, + 0xcf, 0x67, 0x82, 0x1c, 0x80, 0x79, 0x85, 0x5e, + 0xe5, 0xf9, 0x3a, 0x0d, 0x1a, 0xa7, 0x89, 0x29, + 0xee, 0xe7, 0x2b, 0xd6, 0x29, 0xac, 0xfa, 0xca, + 0xc8, 0xcb, 0x4e, 0x6c, 0x1f, 0x30, 0x5e, 0x95, + 0xa5, 0xa2, 0x17, 0xe2, 0x93, 0xd3, 0xe6, 0xbe, + 0x91, 0x37, 0x84, 0x01, 0xdb, 0x44, 0x4c, 0x60, + 0x1c, 0x2c, 0x64, 0x7d, 0xb7, 0x73, 0x12, 0x11, + 0xc2, 0x6a, 0xfd, 0xac, 0x6d, 0x85, 0xd8, 0xeb, + 0x0e, 0x70, 0xd3, 0x82, 0x93, 0x65, 0xff, 0x18, + 0x4e, 0x22, 0x07, 0x8a, 0xf6, 0xfd, 0x36, 0x9d, + 0x5c, 0x15, 0x1c, 0x84, 0x69, 0x13, 0x68, 0x78, + 0xf1, 0x04, 0x02, 0x66, 0xec, 0x37, 0xcc, 0x0d, }; #elif defined(WOLFSSL_AES_192) byte expected24[CTR_LEN] = { @@ -2258,6 +2270,18 @@ int test_wc_AesCtrEncryptDecrypt(void) 0x8d, 0x3b, 0xa9, 0x17, 0x4c, 0x2a, 0xc7, 0x97, 0x99, 0xb7, 0xaf, 0x86, 0x17, 0xf9, 0xe4, 0x2c, 0x5a, 0x4d, 0x6d, 0x7f, 0xfe, 0xb8, 0xaa, 0x9b, + 0xf8, 0xb6, 0xcb, 0x6f, 0x2f, 0xa4, 0x57, 0x61, + 0x88, 0x6c, 0x94, 0xaa, 0xf7, 0x97, 0xcf, 0xcd, + 0x19, 0x29, 0x9e, 0xf3, 0x30, 0xb8, 0xaa, 0x56, + 0x49, 0xcb, 0xf0, 0x56, 0xdd, 0xac, 0x4b, 0x41, + 0x00, 0xb3, 0x19, 0xdd, 0xef, 0x69, 0xd0, 0x9c, + 0xd1, 0x67, 0x48, 0x62, 0x9f, 0x56, 0x21, 0x2d, + 0x05, 0xb3, 0x4d, 0x0b, 0xac, 0xb6, 0x63, 0xf4, + 0x44, 0xfc, 0x43, 0xc0, 0xa9, 0x8c, 0x37, 0xd6, + 0xc3, 0x8c, 0xa4, 0x42, 0x68, 0x08, 0x2c, 0x1e, + 0xe7, 0xcc, 0xe4, 0x1f, 0x82, 0x9a, 0xe0, 0xfb, + 0x18, 0x84, 0x55, 0xaf, 0x02, 0xcc, 0x55, 0x13, + 0x7e, 0xc7, 0x05, 0xb8, 0xb9, 0x5e, 0x90, 0xc3, }; #else byte expected32[CTR_LEN] = { @@ -2279,6 +2303,18 @@ int test_wc_AesCtrEncryptDecrypt(void) 0xf1, 0x7b, 0x2b, 0x87, 0xe4, 0xcd, 0x93, 0x22, 
0x07, 0xdc, 0x35, 0x46, 0x8a, 0x1d, 0xf5, 0xe4, 0x23, 0x01, 0x67, 0x00, 0x66, 0x7b, 0xd6, 0x56, + 0x0d, 0x57, 0x4f, 0x6f, 0x45, 0x82, 0x91, 0x58, + 0x81, 0x37, 0xcc, 0xb4, 0xa4, 0xa3, 0x3c, 0x57, + 0x42, 0x05, 0x95, 0xa3, 0x04, 0x1f, 0xfd, 0x32, + 0xb7, 0xc8, 0xbb, 0x14, 0xe7, 0xf1, 0xc1, 0x1f, + 0xe9, 0x33, 0x6a, 0xb0, 0x10, 0x0d, 0xfb, 0x91, + 0x88, 0xca, 0x20, 0x29, 0xeb, 0xcd, 0x9c, 0x71, + 0x07, 0xfd, 0x3f, 0x6b, 0x1f, 0xb3, 0x76, 0xb7, + 0x6b, 0xa1, 0xad, 0xbe, 0xd3, 0x45, 0xb5, 0xe9, + 0x04, 0x9a, 0xfd, 0x6a, 0x85, 0xa2, 0xbc, 0x4e, + 0xca, 0xdb, 0x84, 0xbc, 0x0e, 0x0c, 0x96, 0x65, + 0xc9, 0x95, 0x2b, 0xcb, 0x98, 0x8c, 0xd2, 0x78, + 0x85, 0x7e, 0x1a, 0xa2, 0x6a, 0x73, 0x90, 0x80, }; #endif byte iv[] = "1234567890abcdef"; @@ -3407,6 +3443,275 @@ int test_wc_AesCcmEncryptDecrypt(void) return EXPECT_RESULT(); } /* END test_wc_AesCcmEncryptDecrypt */ +/******************************************************************************* + * AES-XTS + ******************************************************************************/ + +/* + * test function for wc_AesXtsSetKey() + */ +int test_wc_AesXtsSetKey(void) +{ + EXPECT_DECLS; +#if !defined(NO_AES) && defined(WOLFSSL_AES_XTS) + XtsAes aes; +#ifdef WOLFSSL_AES_128 + byte key16[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + }; +#endif +#if defined(WOLFSSL_AES_192) && !defined(HAVE_FIPS) + byte key24[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66 + }; +#endif +#ifdef WOLFSSL_AES_256 + byte key32[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x30, 
0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66 + }; +#endif + byte badKey16[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65 + }; + byte badKey24[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36 + }; + byte badKey32[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x37, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65 + }; + byte* key; + word32 keyLen; + +#ifdef WOLFSSL_AES_128 + key = key16; + keyLen = sizeof(key16)/sizeof(byte); +#elif defined(WOLFSSL_AES_192) && !defined(HAVE_FIPS) + key = key24; + keyLen = sizeof(key24)/sizeof(byte); +#else + key = key32; + keyLen = sizeof(key32)/sizeof(byte); +#endif + +#ifdef WOLFSSL_AES_128 + ExpectIntEQ(wc_AesXtsSetKey(&aes, key16, sizeof(key16)/sizeof(byte), + AES_ENCRYPTION, NULL, INVALID_DEVID), 0); + wc_AesXtsFree(&aes); +#endif +#if defined(WOLFSSL_AES_192) && !defined(HAVE_FIPS) + ExpectIntEQ(wc_AesXtsSetKey(&aes, key24, sizeof(key24)/sizeof(byte), + AES_ENCRYPTION, NULL, INVALID_DEVID), 0); + wc_AesXtsFree(&aes); +#endif +#ifdef WOLFSSL_AES_256 + ExpectIntEQ(wc_AesXtsSetKey(&aes, key32, sizeof(key32)/sizeof(byte), +
AES_ENCRYPTION, NULL, INVALID_DEVID), 0); + wc_AesXtsFree(&aes); +#endif + + /* Pass in bad args. */ + ExpectIntEQ(wc_AesXtsSetKey(NULL, NULL, keyLen, AES_ENCRYPTION, NULL, + INVALID_DEVID), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesXtsSetKey(NULL, key, keyLen, AES_ENCRYPTION, NULL, + INVALID_DEVID), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesXtsSetKey(&aes, NULL, keyLen, AES_ENCRYPTION, NULL, + INVALID_DEVID), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesXtsSetKey(&aes, badKey16, sizeof(badKey16)/sizeof(byte), + AES_ENCRYPTION, NULL, INVALID_DEVID), WC_NO_ERR_TRACE(WC_KEY_SIZE_E)); + ExpectIntEQ(wc_AesXtsSetKey(&aes, badKey24, sizeof(badKey24)/sizeof(byte), + AES_ENCRYPTION, NULL, INVALID_DEVID), WC_NO_ERR_TRACE(WC_KEY_SIZE_E)); + ExpectIntEQ(wc_AesXtsSetKey(&aes, badKey32, sizeof(badKey32)/sizeof(byte), + AES_ENCRYPTION, NULL, INVALID_DEVID), WC_NO_ERR_TRACE(WC_KEY_SIZE_E)); + ExpectIntEQ(wc_AesXtsSetKey(&aes, key, keyLen, -2, NULL, INVALID_DEVID), + WC_NO_ERR_TRACE(BAD_FUNC_ARG)); +#endif + return EXPECT_RESULT(); +} /* END test_wc_AesXtsSetKey */ + +int test_wc_AesXtsEncryptDecrypt_Sizes(void) +{ + EXPECT_DECLS; +#if !defined(NO_AES) && defined(WOLFSSL_AES_XTS) && \ + defined(WOLFSSL_AES_256) && !defined(WOLFSSL_AFALG) && \ + !defined(WOLFSSL_KCAPI) + #define XTS_LEN (WC_AES_BLOCK_SIZE * 16) + byte key32[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66 + }; + byte tweak[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + }; + XtsAes aes; + word32 tweakLen = (word32)sizeof(tweak)/sizeof(byte); + int sz; + WC_DECLARE_VAR(plain, byte, 
XTS_LEN, NULL); + WC_DECLARE_VAR(cipher, byte, XTS_LEN, NULL); +#ifdef HAVE_AES_DECRYPT + WC_DECLARE_VAR(decrypted, byte, XTS_LEN, NULL); +#endif + + WC_ALLOC_VAR(plain, byte, XTS_LEN, NULL); + WC_ALLOC_VAR(cipher, byte, XTS_LEN, NULL); +#ifdef HAVE_AES_DECRYPT + WC_ALLOC_VAR(decrypted, byte, XTS_LEN, NULL); +#endif + +#ifdef WC_DECLARE_VAR_IS_HEAP_ALLOC + ExpectNotNull(plain); + ExpectNotNull(cipher); +#ifdef HAVE_AES_DECRYPT + ExpectNotNull(decrypted); +#endif +#endif + + XMEMSET(&aes, 0, sizeof(aes)); + XMEMSET(plain, 0xa5, XTS_LEN); + + for (sz = WC_AES_BLOCK_SIZE; sz <= XTS_LEN; sz *= 2) { + ExpectIntEQ(wc_AesXtsSetKey(&aes, key32, sizeof(key32)/sizeof(byte), + AES_ENCRYPTION, NULL, INVALID_DEVID), 0); + XMEMSET(cipher, 0, XTS_LEN); + ExpectIntEQ(wc_AesXtsEncrypt(&aes, cipher, plain, sz, tweak, tweakLen), + 0); + wc_AesXtsFree(&aes); + +#ifdef HAVE_AES_DECRYPT + ExpectIntEQ(wc_AesXtsSetKey(&aes, key32, sizeof(key32)/sizeof(byte), + AES_DECRYPTION, NULL, INVALID_DEVID), 0); + XMEMSET(decrypted, 0xff, XTS_LEN); + ExpectIntEQ(wc_AesXtsDecrypt(&aes, decrypted, cipher, sz, tweak, + tweakLen), 0); + ExpectBufEQ(decrypted, plain, sz); + wc_AesXtsFree(&aes); +#endif + } + + WC_FREE_VAR(plain, NULL); + WC_FREE_VAR(cipher, NULL); +#ifdef HAVE_AES_DECRYPT + WC_FREE_VAR(decrypted, NULL); +#endif +#endif + return EXPECT_RESULT(); +} + +/* + * test function for wc_AesXtsEncrypt and wc_AesXtsDecrypt + */ +int test_wc_AesXtsEncryptDecrypt(void) +{ + EXPECT_DECLS; +#if !defined(NO_AES) && defined(WOLFSSL_AES_XTS) && \ + defined(WOLFSSL_AES_256) + XtsAes aes; + byte key32[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66 + }; +
byte vector[] = { /* Now is the time for all w/o trailing 0 */ + 0x4e,0x6f,0x77,0x20,0x69,0x73,0x20,0x74, + 0x68,0x65,0x20,0x74,0x69,0x6d,0x65,0x20, + 0x66,0x6f,0x72,0x20,0x61,0x6c,0x6c,0x20 + }; + byte tweak[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + }; + word32 tweakLen = (word32)sizeof(tweak)/sizeof(byte); + byte enc[sizeof(vector)]; + byte resultT[WC_AES_BLOCK_SIZE]; + byte dec[sizeof(vector)]; + + /* Init stack variables. */ + XMEMSET(&aes, 0, sizeof(aes)); + XMEMSET(enc, 0, sizeof(vector)); + XMEMSET(dec, 0, sizeof(vector)); + XMEMSET(resultT, 0, WC_AES_BLOCK_SIZE); + + ExpectIntEQ(wc_AesXtsSetKey(&aes, key32, sizeof(key32)/sizeof(byte), + AES_ENCRYPTION, NULL, INVALID_DEVID), 0); + ExpectIntEQ(wc_AesXtsEncrypt(&aes, enc, vector, sizeof(vector), tweak, + tweakLen), 0); + wc_AesXtsFree(&aes); + ExpectIntEQ(wc_AesXtsSetKey(&aes, key32, sizeof(key32)/sizeof(byte), + AES_DECRYPTION, NULL, INVALID_DEVID), 0); + ExpectIntEQ(wc_AesXtsDecrypt(&aes, dec, enc, sizeof(vector), tweak, + tweakLen), 0); + ExpectIntEQ(XMEMCMP(vector, dec, sizeof(vector)), 0); + wc_AesXtsFree(&aes); + + ExpectIntEQ(wc_AesXtsSetKey(&aes, key32, sizeof(key32)/sizeof(byte), + AES_ENCRYPTION, NULL, INVALID_DEVID), 0); + /* Test bad args for wc_AesXtsEncrypt and wc_AesXtsDecrypt */ + ExpectIntEQ(wc_AesXtsEncrypt(NULL, enc, vector, sizeof(vector), tweak, + tweakLen), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesXtsEncrypt(&aes, NULL, vector, sizeof(vector), tweak, + tweakLen), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesXtsEncrypt(&aes, enc, NULL, sizeof(vector), tweak, + tweakLen), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + wc_AesXtsFree(&aes); + /* END wc_AesXtsEncrypt */ + +#ifdef HAVE_AES_DECRYPT + ExpectIntEQ(wc_AesXtsSetKey(&aes, key32, sizeof(key32)/sizeof(byte), + AES_DECRYPTION, NULL, INVALID_DEVID), 0); + ExpectIntEQ(wc_AesXtsDecrypt(NULL, dec, enc, sizeof(enc)/sizeof(byte), + tweak, tweakLen), 
WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesXtsDecrypt(&aes, NULL, enc, sizeof(enc)/sizeof(byte), + tweak, tweakLen), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + ExpectIntEQ(wc_AesXtsDecrypt(&aes, dec, NULL, sizeof(enc)/sizeof(byte), + tweak, tweakLen), WC_NO_ERR_TRACE(BAD_FUNC_ARG)); + wc_AesXtsFree(&aes); +#endif /* HAVE_AES_DECRYPT */ +#endif + + return EXPECT_RESULT(); +} /* END test_wc_AesXtsEncryptDecrypt */ + #if defined(WOLFSSL_AES_EAX) && defined(WOLFSSL_AES_256) && \ (!defined(HAVE_FIPS) || FIPS_VERSION_GE(5, 3)) && !defined(HAVE_SELFTEST) diff --git a/tests/api/test_aes.h b/tests/api/test_aes.h index cdb400ed1..99265f333 100644 --- a/tests/api/test_aes.h +++ b/tests/api/test_aes.h @@ -41,6 +41,9 @@ int test_wc_AesGcmMixedEncDecLongIV(void); int test_wc_AesGcmStream(void); int test_wc_AesCcmSetKey(void); int test_wc_AesCcmEncryptDecrypt(void); +int test_wc_AesXtsSetKey(void); +int test_wc_AesXtsEncryptDecrypt_Sizes(void); +int test_wc_AesXtsEncryptDecrypt(void); #if defined(WOLFSSL_AES_EAX) && defined(WOLFSSL_AES_256) && \ (!defined(HAVE_FIPS) || FIPS_VERSION_GE(5, 3)) && !defined(HAVE_SELFTEST) int test_wc_AesEaxVectors(void); @@ -68,7 +71,10 @@ int test_wc_GmacUpdate(void); TEST_DECL_GROUP("aes", test_wc_AesGcmMixedEncDecLongIV), \ TEST_DECL_GROUP("aes", test_wc_AesGcmStream), \ TEST_DECL_GROUP("aes", test_wc_AesCcmSetKey), \ - TEST_DECL_GROUP("aes", test_wc_AesCcmEncryptDecrypt) + TEST_DECL_GROUP("aes", test_wc_AesCcmEncryptDecrypt), \ + TEST_DECL_GROUP("aes", test_wc_AesXtsSetKey), \ + TEST_DECL_GROUP("aes", test_wc_AesXtsEncryptDecrypt_Sizes), \ + TEST_DECL_GROUP("aes", test_wc_AesXtsEncryptDecrypt) #if defined(WOLFSSL_AES_EAX) && defined(WOLFSSL_AES_256) && \ (!defined(HAVE_FIPS) || FIPS_VERSION_GE(5, 3)) && !defined(HAVE_SELFTEST) diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index 797e64e6b..0caad57d9 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -1038,7 +1038,7 @@ static const 
bench_alg bench_cipher_opt[] = { #ifdef HAVE_AESGCM { "-aes-gmac", BENCH_AES_GMAC }, #endif -#ifdef WOLFSSL_AES_DIRECT +#if defined(HAVE_AES_ECB) || (defined(HAVE_FIPS) && defined(WOLFSSL_AES_DIRECT)) { "-aes-ecb", BENCH_AES_ECB }, #endif #ifdef WOLFSSL_AES_XTS @@ -3844,7 +3844,7 @@ static void* benchmarks_do(void* args) #endif } #endif -#ifdef HAVE_AES_ECB +#if defined(HAVE_AES_ECB) || (defined(HAVE_FIPS) && defined(WOLFSSL_AES_DIRECT)) if (bench_all || (bench_cipher_algs & BENCH_AES_ECB)) { #ifndef NO_SW_BENCH bench_aesecb(0); @@ -5604,7 +5604,7 @@ void bench_gmac(int useDeviceID) #endif /* HAVE_AESGCM */ -#ifdef HAVE_AES_ECB +#if defined(HAVE_AES_ECB) || (defined(HAVE_FIPS) && defined(WOLFSSL_AES_DIRECT)) static void bench_aesecb_internal(int useDeviceID, const byte* key, word32 keySz, const char* encLabel, const char* decLabel) @@ -5773,7 +5773,7 @@ void bench_aesecb(int useDeviceID) "AES-256-ECB-enc", "AES-256-ECB-dec"); #endif } -#endif /* HAVE_AES_ECB */ +#endif /* HAVE_AES_ECB || (HAVE_FIPS && WOLFSSL_AES_DIRECT) */ #ifdef WOLFSSL_AES_CFB static void bench_aescfb_internal(const byte* key, diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 09a96762f..0dd3658c9 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -806,54 +806,95 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits } #endif /* HAVE_AES_DECRYPT */ -#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#elif defined(WOLFSSL_ARMASM) +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +static cpuid_flags_t cpuid_flags = WC_CPUID_INITIALIZER; - #define NEED_AES_TABLES +static void Check_CPU_support_HwCrypto(Aes* aes) +{ + cpuid_get_flags_ex(&cpuid_flags); + aes->use_aes_hw_crypto = IS_AARCH64_AES(cpuid_flags); +#ifdef HAVE_AESGCM + aes->use_pmull_hw_crypto = IS_AARCH64_PMULL(cpuid_flags); + aes->use_sha3_hw_crypto = IS_AARCH64_SHA3(cpuid_flags); +#endif +} +#endif /* __aarch64__ && 
!WOLFSSL_ARMASM_NO_HW_CRYPTO */ - static cpuid_flags_t cpuid_flags = WC_CPUID_INITIALIZER; - - static void Check_CPU_support_HwCrypto(Aes* aes) - { - cpuid_get_flags_ex(&cpuid_flags); - aes->use_aes_hw_crypto = IS_AARCH64_AES(cpuid_flags); - #ifdef HAVE_AESGCM - aes->use_pmull_hw_crypto = IS_AARCH64_PMULL(cpuid_flags); - aes->use_sha3_hw_crypto = IS_AARCH64_SHA3(cpuid_flags); - #endif - } - -#elif !defined(__aarch64__) && defined(WOLFSSL_ARMASM) - -#if defined(WOLFSSL_AES_DIRECT) || defined(HAVE_AESCCM) +#if defined(WOLFSSL_AES_DIRECT) || defined(HAVE_AESCCM) || \ + defined(WOLFSSL_AESGCM_STREAM) static WARN_UNUSED_RESULT int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock) { #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO +#if !defined(__aarch64__) AES_encrypt_AARCH32(inBlock, outBlock, (byte*)aes->key, (int)aes->rounds); #else - AES_ECB_encrypt(inBlock, outBlock, WC_AES_BLOCK_SIZE, (byte*)aes->key, - (int)aes->rounds); + if (aes->use_aes_hw_crypto) { + AES_encrypt_AARCH64(inBlock, outBlock, (byte*)aes->key, + (int)aes->rounds); + } + else +#endif /* !__aarch64__ */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) +#ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) +#else + if (0) #endif + { + AES_ECB_encrypt_NEON(inBlock, outBlock, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); + } + else +#endif /* __aarch64__ && !WOLFSSL_ARMASM_NO_NEON */ +#if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + { + AES_ECB_encrypt(inBlock, outBlock, WC_AES_BLOCK_SIZE, (byte*)aes->key, + (int)aes->rounds); + } +#endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ return 0; } #endif -#ifdef HAVE_AES_DECRYPT -#ifdef WOLFSSL_AES_DIRECT +#if defined(HAVE_AES_DECRYPT) && defined(WOLFSSL_AES_DIRECT) static WARN_UNUSED_RESULT int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) { #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO +#if !defined(__aarch64__) AES_decrypt_AARCH32(inBlock, outBlock, 
(byte*)aes->key, (int)aes->rounds); #else - AES_ECB_decrypt(inBlock, outBlock, WC_AES_BLOCK_SIZE, (byte*)aes->key, - (int)aes->rounds); + if (aes->use_aes_hw_crypto) { + AES_decrypt_AARCH64(inBlock, outBlock, (byte*)aes->key, + (int)aes->rounds); + } + else +#endif /* !__aarch64__ */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) +#ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) +#else + if (0) #endif + { + AES_ECB_decrypt_NEON(inBlock, outBlock, WC_AES_BLOCK_SIZE, + (byte*)aes->key, (int)aes->rounds); + } + else +#endif +#if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + { + AES_ECB_decrypt(inBlock, outBlock, WC_AES_BLOCK_SIZE, (byte*)aes->key, + (int)aes->rounds); + } +#endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ return 0; } -#endif -#endif +#endif /* HAVE_AES_DECRYPT && WOLFSSL_AES_DIRECT */ #elif defined(FREESCALE_MMCAU) /* Freescale mmCAU hardware AES support for Direct, CBC, CCM, GCM modes @@ -1169,8 +1210,7 @@ static const FLASH_QUALIFIER word32 rcon[] = { #endif /* ESP32 */ #endif /* __aarch64__ || !WOLFSSL_ARMASM */ -#if defined(__aarch64__) || !defined(WOLFSSL_ARMASM) || \ - defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) || defined(WOLFSSL_AES_DIRECT) || \ +#if !defined(WOLFSSL_ARMASM) || defined(WOLFSSL_AES_DIRECT) || \ defined(HAVE_AESCCM) #ifndef WOLFSSL_AES_SMALL_TABLES static const FLASH_QUALIFIER word32 Te[4][256] = { @@ -1824,8 +1864,7 @@ static WARN_UNUSED_RESULT word32 inv_col_mul( #if defined(HAVE_AES_CBC) || defined(WOLFSSL_AES_DIRECT) || \ defined(HAVE_AESCCM) || defined(HAVE_AESGCM) -#if defined(__aarch64__) || !defined(WOLFSSL_ARMASM) || \ - defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) || defined(WOLFSSL_AES_DIRECT) || \ +#if !defined(WOLFSSL_ARMASM) || defined(WOLFSSL_AES_DIRECT) || \ defined(HAVE_AESCCM) @@ -3042,20 +3081,36 @@ static WARN_UNUSED_RESULT int wc_AesEncrypt( printf("Skipping AES-NI\n"); #endif } -#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - 
!defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#elif defined(WOLFSSL_ARMASM) +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO +#if !defined(__aarch64__) + AES_encrypt_AARCH32(inBlock, outBlock, (byte*)aes->key, (int)aes->rounds); +#else if (aes->use_aes_hw_crypto) { AES_encrypt_AARCH64(inBlock, outBlock, (byte*)aes->key, (int)aes->rounds); - return 0; } -#elif !defined(__aarch64__) && defined(WOLFSSL_ARMASM) -#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO - AES_encrypt_AARCH32(inBlock, outBlock, (byte*)aes->key, (int)aes->rounds); + else +#endif /* !__aarch64__ */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) +#ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) #else - AES_ECB_encrypt(inBlock, outBlock, WC_AES_BLOCK_SIZE, - (const unsigned char*)aes->key, aes->rounds); + if (0) #endif + { + AES_ECB_encrypt_NEON(inBlock, outBlock, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); + } + else +#endif /* __aarch64__ && !WOLFSSL_ARMASM_NO_NEON */ +#if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + { + AES_ECB_encrypt(inBlock, outBlock, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); + } +#endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ return 0; #endif /* WOLFSSL_AESNI */ #if defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES) @@ -3815,20 +3870,36 @@ static WARN_UNUSED_RESULT int wc_AesDecrypt( printf("Skipping AES-NI\n"); #endif } -#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#elif defined(WOLFSSL_ARMASM) +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO +#if !defined(__aarch64__) + AES_decrypt_AARCH32(inBlock, outBlock, (byte*)aes->key, (int)aes->rounds); +#else if (aes->use_aes_hw_crypto) { AES_decrypt_AARCH64(inBlock, outBlock, (byte*)aes->key, (int)aes->rounds); - return 0; } -#elif !defined(__aarch64__) && defined(WOLFSSL_ARMASM) -#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO - AES_decrypt_AARCH32(inBlock, outBlock, (byte*)aes->key, 
(int)aes->rounds); + else +#endif /* !__aarch64__ */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) +#ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) #else - AES_ECB_decrypt(inBlock, outBlock, WC_AES_BLOCK_SIZE, - (const unsigned char*)aes->key, aes->rounds); + if (0) #endif + { + AES_ECB_decrypt_NEON(inBlock, outBlock, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); + } + else +#endif /* __aarch64__ && !WOLFSSL_ARMASM_NO_NEON */ +#if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + { + AES_ECB_decrypt(inBlock, outBlock, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); + } +#endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ return 0; #endif /* WOLFSSL_AESNI */ #if defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES) @@ -4455,8 +4526,7 @@ static WARN_UNUSED_RESULT int wc_AesDecrypt( #ifdef NEED_AES_TABLES #ifndef WC_AES_BITSLICED -#if defined(__aarch64__) || !defined(WOLFSSL_ARMASM) || \ - defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#if !defined(WOLFSSL_ARMASM) /* Set the AES key and expand. * * @param [in] aes AES object. 
@@ -4940,14 +5010,47 @@ static void AesSetKey_C(Aes* aes, const byte* key, word32 keySz, int dir) } #endif /* WOLFSSL_AESNI */ - #if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#if defined(WOLFSSL_ARMASM) +#if !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + #ifndef __aarch64__ + AES_set_key_AARCH32(userKey, keylen, (byte*)aes->key, dir); + #else Check_CPU_support_HwCrypto(aes); if (aes->use_aes_hw_crypto) { AES_set_key_AARCH64(userKey, keylen, (byte*)aes->key, dir); - return 0; } - #endif + else + #endif /* __aarch64__ */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + #if !defined(WOLFSSL_ARMASM_NO_NEON) + if (1) { + AES_set_encrypt_key_NEON(userKey, keylen * 8, (byte*)aes->key); + #ifdef HAVE_AES_DECRYPT + if (dir == AES_DECRYPTION) { + AES_invert_key_NEON((byte*)aes->key, aes->rounds); + } + #else + (void)dir; + #endif + } + else + #endif /* !WOLFSSL_ARMASM_NO_NEON */ +#endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ + #if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + { + AES_set_encrypt_key(userKey, keylen * 8, (byte*)aes->key); + #ifdef HAVE_AES_DECRYPT + if (dir == AES_DECRYPTION) { + AES_invert_key((byte*)aes->key, aes->rounds); + } + #else + (void)dir; + #endif + } + #endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ + return 0; +#endif /* WOLFSSL_ARMASM */ #ifdef WOLFSSL_KCAPI_AES XMEMCPY(aes->devKey, userKey, keylen); @@ -5171,8 +5274,8 @@ int wc_AesSetIV(Aes* aes, const byte* iv) #else /* !WOLFSSL_AESNI */ -#define VECTOR_REGISTERS_PUSH { WC_DO_NOTHING -#define VECTOR_REGISTERS_POP } WC_DO_NOTHING +#define VECTOR_REGISTERS_PUSH WC_DO_NOTHING +#define VECTOR_REGISTERS_POP WC_DO_NOTHING #endif /* !WOLFSSL_AESNI */ @@ -6171,7 +6274,7 @@ int wc_AesSetIV(Aes* aes, const byte* iv) int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) { -#if defined(__aarch64__) || !defined(WOLFSSL_ARMASM) +#if 
!defined(WOLFSSL_ARMASM) word32 blocks; int ret; #endif @@ -6184,7 +6287,7 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) return 0; } -#if defined(__aarch64__) || !defined(WOLFSSL_ARMASM) +#if !defined(WOLFSSL_ARMASM) blocks = sz / WC_AES_BLOCK_SIZE; #endif #ifdef WOLFSSL_AES_CBC_LENGTH_CHECKS @@ -6234,14 +6337,37 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) } #endif /* WOLFSSL_ASYNC_CRYPT */ -#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) +#if defined(WOLFSSL_ARMASM) #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + #if !defined(__aarch64__) AES_CBC_encrypt_AARCH32(in, out, sz, (byte*)aes->reg, (byte*)aes->key, (int)aes->rounds); -#else - AES_CBC_encrypt(in, out, sz, (const unsigned char*)aes->key, - aes->rounds, (unsigned char*)aes->reg); -#endif + #else + if (aes->use_aes_hw_crypto) { + AES_CBC_encrypt_AARCH64(in, out, sz, (byte*)aes->reg, + (byte*)aes->key, (int)aes->rounds); + } + else + #endif /* !__aarch64__ */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ + #if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) + #ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) + #else + if (0) + #endif + { + AES_CBC_encrypt_NEON(in, out, sz, (const unsigned char*)aes->key, + aes->rounds, (unsigned char*)aes->reg); + } + else + #endif /* __aarch64__ && !WOLFSSL_ARMASM_NO_NEON */ + #if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + { + AES_CBC_encrypt(in, out, sz, (const unsigned char*)aes->key, + aes->rounds, (unsigned char*)aes->reg); + } + #endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ return 0; #else #if defined(WOLFSSL_SE050) && defined(WOLFSSL_SE050_CRYPT) @@ -6312,14 +6438,6 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) } } else - #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto) { - AES_CBC_encrypt_AARCH64(in, out, sz, (byte*)aes->reg, - (byte*)aes->key, (int)aes->rounds); - ret = 0; - 
} - else #endif { ret = 0; @@ -6347,7 +6465,7 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) /* Software AES - CBC Decrypt */ int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) { -#if defined(__aarch64__) || !defined(WOLFSSL_ARMASM) +#if !defined(WOLFSSL_ARMASM) word32 blocks; int ret; #endif @@ -6375,7 +6493,7 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) } #endif -#if defined(__aarch64__) || !defined(WOLFSSL_ARMASM) +#if !defined(WOLFSSL_ARMASM) blocks = sz / WC_AES_BLOCK_SIZE; #endif if (sz % WC_AES_BLOCK_SIZE) { @@ -6434,14 +6552,37 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) } #endif -#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) +#if defined(WOLFSSL_ARMASM) #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + #if !defined(__aarch64__) AES_CBC_decrypt_AARCH32(in, out, sz, (byte*)aes->reg, (byte*)aes->key, (int)aes->rounds); -#else - AES_CBC_decrypt(in, out, sz, (const unsigned char*)aes->key, - aes->rounds, (unsigned char*)aes->reg); -#endif + #else + if (aes->use_aes_hw_crypto) { + AES_CBC_decrypt_AARCH64(in, out, sz, (byte*)aes->reg, + (byte*)aes->key, (int)aes->rounds); + } + else + #endif /* !__aarch64__ */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ + #if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) + #ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) + #else + if (sz >= 64) + #endif + { + AES_CBC_decrypt_NEON(in, out, sz, (const unsigned char*)aes->key, + aes->rounds, (unsigned char*)aes->reg); + } + else + #endif /* __aarch64__ && !WOLFSSL_ARMASM_NO_NEON */ + #if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + { + AES_CBC_decrypt(in, out, sz, (const unsigned char*)aes->key, + aes->rounds, (unsigned char*)aes->reg); + } + #endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ return 0; #else VECTOR_REGISTERS_PUSH; @@ -6475,14 +6616,6 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) ret = 0; } else - #elif 
defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto) { - AES_CBC_decrypt_AARCH64(in, out, sz, (byte*)aes->reg, - (byte*)aes->key, (int)aes->rounds); - ret = 0; - } - else #endif { ret = 0; @@ -6777,8 +6910,7 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) #endif #ifdef NEED_AES_CTR_SOFT - #if !(!defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)) + #ifndef WOLFSSL_ARMASM /* Increment AES counter */ static WC_INLINE void IncrementAesCounter(byte* inOutCtr) { @@ -6789,7 +6921,7 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) return; } } - #endif + #endif /* Software AES - CTR Encrypt */ int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) @@ -6798,7 +6930,7 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)) byte scratch[WC_AES_BLOCK_SIZE]; #endif - #if defined(__aarch64__) || !defined(WOLFSSL_ARMASM) + #if !defined(WOLFSSL_ARMASM) int ret = 0; #endif word32 processed; @@ -6833,11 +6965,21 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) aes->left -= processed; sz -= processed; - #if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) - #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + #if defined(WOLFSSL_ARMASM) + #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + #ifndef __aarch64__ AES_CTR_encrypt_AARCH32(in, out, sz, (byte*)aes->reg, (byte*)aes->key, (byte*)aes->tmp, &aes->left, aes->rounds); - #else + #else + if (aes->use_aes_hw_crypto) { + AES_CTR_encrypt_AARCH64(in, out, sz, (byte*)aes->reg, + (byte*)aes->key, (byte*)aes->tmp, &aes->left, aes->rounds); + return 0; + } + else + #endif /* !__aarch64__ */ + #endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ + #if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) { word32 numBlocks; byte* tmp = (byte*)aes->tmp + WC_AES_BLOCK_SIZE - aes->left; @@ -6851,8 +6993,23 @@ int 
wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) /* do as many block size ops as possible */ numBlocks = sz / WC_AES_BLOCK_SIZE; if (numBlocks > 0) { - AES_CTR_encrypt(in, out, numBlocks * WC_AES_BLOCK_SIZE, - (byte*)aes->key, aes->rounds, (byte*)aes->reg); + #if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) + #ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) + #else + if (sz >= 32) + #endif + { + AES_CTR_encrypt_NEON(in, out, + numBlocks * WC_AES_BLOCK_SIZE, (byte*)aes->key, + aes->rounds, (byte*)aes->reg); + } + else + #endif + { + AES_CTR_encrypt(in, out, numBlocks * WC_AES_BLOCK_SIZE, + (byte*)aes->key, aes->rounds, (byte*)aes->reg); + } sz -= numBlocks * WC_AES_BLOCK_SIZE; out += numBlocks * WC_AES_BLOCK_SIZE; @@ -6864,8 +7021,24 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) byte zeros[WC_AES_BLOCK_SIZE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - AES_CTR_encrypt(zeros, (byte*)aes->tmp, WC_AES_BLOCK_SIZE, - (byte*)aes->key, aes->rounds, (byte*)aes->reg); + #if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) + #ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) + #else + if (0) + #endif + { + AES_CTR_encrypt_NEON(zeros, (byte*)aes->tmp, + WC_AES_BLOCK_SIZE, (byte*)aes->key, aes->rounds, + (byte*)aes->reg); + } + else + #endif + { + AES_CTR_encrypt(zeros, (byte*)aes->tmp, + WC_AES_BLOCK_SIZE, (byte*)aes->key, aes->rounds, + (byte*)aes->reg); + } aes->left = WC_AES_BLOCK_SIZE; tmp = (byte*)aes->tmp; @@ -6876,18 +7049,9 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) } } } - #endif + #endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ return 0; #else - #if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto) { - AES_CTR_encrypt_AARCH64(in, out, sz, (byte*)aes->reg, - (byte*)aes->key, (byte*)aes->tmp, &aes->left, aes->rounds); - return 0; - } - #endif - VECTOR_REGISTERS_PUSH; #if 
defined(HAVE_AES_ECB) && !defined(WOLFSSL_PIC32MZ_CRYPT) && \ @@ -7041,6 +7205,8 @@ static WC_INLINE void IncCtr(byte* ctr, word32 ctrSz) #else /* software + AESNI implementation */ #if !defined(FREESCALE_LTC_AES_GCM) +#if (!(defined(__aarch64__) && defined(WOLFSSL_ARMASM))) || \ + defined(WOLFSSL_AESGCM_STREAM) static WC_INLINE void IncrementGcmCounter(byte* inOutCtr) { int i; @@ -7051,6 +7217,7 @@ static WC_INLINE void IncrementGcmCounter(byte* inOutCtr) return; } } +#endif #endif /* !FREESCALE_LTC_AES_GCM */ #if !defined(WOLFSSL_ARMASM) || defined(__aarch64__) || \ @@ -7177,22 +7344,28 @@ void GenerateM0(Gcm* gcm) XMEMCPY(m[0xf], m[0x8], WC_AES_BLOCK_SIZE); xorbuf (m[0xf], m[0x7], WC_AES_BLOCK_SIZE); -#if defined(WOLFSSL_ARMASM) && !defined(__aarch64__) && \ - defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - for (i = 0; i < 16; i++) { - word32* m32 = (word32*)gcm->M0[i]; - m32[0] = ByteReverseWord32(m32[0]); - m32[1] = ByteReverseWord32(m32[1]); - m32[2] = ByteReverseWord32(m32[2]); - m32[3] = ByteReverseWord32(m32[3]); - } -#endif - #if !defined(WC_16BIT_CPU) for (i = 0; i < 16; i++) { Shift4_M0(m[16+i], m[i]); } #endif + +#if defined(WOLFSSL_ARMASM) && defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + for (i = 0; i < 32; i++) { + #if !defined(__aarch64__) + word32* m32 = (word32*)gcm->M0[i]; + m32[0] = ByteReverseWord32(m32[0]); + m32[1] = ByteReverseWord32(m32[1]); + m32[2] = ByteReverseWord32(m32[2]); + m32[3] = ByteReverseWord32(m32[3]); + #else + word64* m64 = (word64*)gcm->M0[i]; + m64[0] = ByteReverseWord64(m64[0]); + m64[1] = ByteReverseWord64(m64[1]); + #endif + } +#endif + } #endif /* GCM_TABLE */ @@ -7269,26 +7442,42 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) return ret; #endif /* WOLFSSL_RENESAS_RSIP && WOLFSSL_RENESAS_FSPSM_CRYPTONLY*/ -#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) +#if defined(WOLFSSL_ARMASM) if (ret == 0) { - #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO +#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + #if !defined(__aarch64__) 
AES_GCM_set_key_AARCH32(iv, (byte*)aes->key, aes->gcm.H, aes->rounds); #else - AES_ECB_encrypt(iv, aes->gcm.H, WC_AES_BLOCK_SIZE, - (const unsigned char*)aes->key, aes->rounds); + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + AES_GCM_set_key_AARCH64(iv, (byte*)aes->key, aes->gcm.H, + aes->rounds); + } + else + #endif /* !__aarch64__ */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) + #ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) + #else + if (0) + #endif + { + AES_ECB_encrypt_NEON(iv, aes->gcm.H, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); + } + else +#endif /* __aarch64__ && !WOLFSSL_ARMASM_NO_NEON */ +#if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + { + AES_ECB_encrypt(iv, aes->gcm.H, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); #if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT) GenerateM0(&aes->gcm); #endif /* GCM_TABLE */ - #endif + } +#endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ } #else -#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (ret == 0 && aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { - AES_GCM_set_key_AARCH64(iv, (byte*)aes->key, aes->gcm.H, aes->rounds); - } - else -#endif #if !defined(FREESCALE_LTC_AES_GCM) && !defined(WOLFSSL_PSOC6_CRYPTO) if (ret == 0) { VECTOR_REGISTERS_PUSH; @@ -7510,7 +7699,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, while (0) #endif /* WOLFSSL_AESGCM_STREAM */ -#if defined(WOLFSSL_ARMASM) && !defined(__aarch64__) +#if defined(WOLFSSL_ARMASM) && !defined(__aarch64__) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) static void GCM_gmult_len_armasm_C( byte* x, const byte* h, const unsigned char* a, unsigned long len) { @@ -7541,14 +7731,30 @@ static void GCM_gmult_len_armasm_C( #define GCM_GMULT_LEN(gcm, x, a, len) \ GCM_gmult_len_armasm_C(x, (gcm)->H, a, len) -#endif /* WOLFSSL_ARMASM &&
!__aarch64__ */ +#endif /* WOLFSSL_ARMASM && !__aarch64__ && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ + +#if defined(WOLFSSL_ARMASM) && (defined(__aarch64__) || \ + defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)) +#if !defined(WOLFSSL_ARMASM_NO_NEON) && defined(__aarch64__) +#define GCM_GMULT_LEN(gcm, x, a, len) \ + GCM_gmult_len_NEON(x, (const byte*)((gcm)->H), a, len) +#else +#define GCM_GMULT_LEN(gcm, x, a, len) \ + GCM_gmult_len(x, (const byte**)((gcm)->M0), a, len) +#endif +#endif #elif defined(GCM_TABLE) -#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#if defined(WOLFSSL_ARMASM) && (defined(__aarch64__) || \ + defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)) +#if !defined(WOLFSSL_ARMASM_NO_NEON) && defined(__aarch64__) +#define GCM_GMULT_LEN(gcm, x, a, len) \ + GCM_gmult_len_NEON(x, (const byte*)((gcm)->H), a, len) +#else #define GCM_GMULT_LEN(gcm, x, a, len) \ GCM_gmult_len(x, (const byte**)((gcm)->M0), a, len) +#endif #else ALIGN16 static const byte R[256][2] = { {0x00, 0x00}, {0x01, 0xc2}, {0x03, 0x84}, {0x02, 0x46}, @@ -7812,10 +8018,19 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, /* end GCM_TABLE */ #elif defined(GCM_TABLE_4BIT) -#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#if defined(WOLFSSL_ARMASM) && (defined(__aarch64__) || \ + defined(WOLFSSL_ARMASM_NO_HW_CRYPTO)) +#if !defined(WOLFSSL_ARMASM_NO_NEON) && defined(__aarch64__) +#define GCM_GMULT_LEN(gcm, x, a, len) \ + GCM_gmult_len_NEON(x, (const byte*)((gcm)->H), a, len) +#define GMULT(x, m) \ + GCM_gmult_NEON(x, (const byte**)m) +#else #define GCM_GMULT_LEN(gcm, x, a, len) \ GCM_gmult_len(x, (const byte**)((gcm)->M0), a, len) +#define GMULT(x, m) \ + GCM_gmult(x, (const byte**)m) +#endif #else /* remainder = x^7 + x^2 + x^1 + 1 => 0xe1 * R shifts right a reverse bit pair of bytes such that: @@ -8270,7 +8485,16 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, */ #define 
GHASH_INIT_EXTRA(aes) WC_DO_NOTHING -/* GHASH one block of data.. +#ifdef GCM_GMULT_LEN +/* GHASH one block of data. + * + * @param [in, out] aes AES GCM object. + * @param [in] block Block of AAD or cipher text. + */ +#define GHASH_ONE_BLOCK_SW(aes, block) \ + GCM_GMULT_LEN(&(aes)->gcm, AES_TAG(aes), block, WC_AES_BLOCK_SIZE) +#else +/* GHASH one block of data. * * XOR block into tag and GMULT with H using pre-computed table. * @@ -8283,6 +8507,7 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, GMULT(AES_TAG(aes), (aes)->gcm.M0); \ } \ while (0) +#endif #endif /* WOLFSSL_AESGCM_STREAM */ #elif defined(WORD64_AVAILABLE) && !defined(GCM_WORD32) @@ -9320,7 +9545,7 @@ static WARN_UNUSED_RESULT int wc_AesGcmEncrypt_STM32( #endif /* STM32_CRYPTO_AES_GCM */ -#if !defined(WOLFSSL_ARMASM) || defined(__aarch64__) +#if !defined(WOLFSSL_ARMASM) #ifdef WOLFSSL_AESNI /* For performance reasons, this code needs to be not inlined. */ WARN_UNUSED_RESULT int AES_GCM_encrypt_C( @@ -9435,8 +9660,8 @@ WARN_UNUSED_RESULT int AES_GCM_encrypt_C( return ret; } -#elif defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) -static int AES_GCM_encrypt_AARCH32(Aes* aes, byte* out, const byte* in, +#elif defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +static int AES_GCM_encrypt_ARM(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -9477,16 +9702,44 @@ static int AES_GCM_encrypt_AARCH32(Aes* aes, byte* out, const byte* in, blocks = sz / WC_AES_BLOCK_SIZE; partial = sz % WC_AES_BLOCK_SIZE; if (blocks > 0) { - AES_GCM_encrypt(in, out, blocks * WC_AES_BLOCK_SIZE, - (const unsigned char*)aes->key, aes->rounds, counter); + #if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) + #ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) + #else + if (sz >= 32) + #endif + { + AES_GCM_encrypt_NEON(in, out, blocks * WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds, counter); 
+ } + else + #endif + { + AES_GCM_encrypt(in, out, blocks * WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds, counter); + } GCM_GMULT_LEN(&aes->gcm, x, out, blocks * WC_AES_BLOCK_SIZE); in += blocks * WC_AES_BLOCK_SIZE; out += blocks * WC_AES_BLOCK_SIZE; } /* take care of partial block sizes leftover */ if (partial != 0) { - AES_GCM_encrypt(in, scratch, WC_AES_BLOCK_SIZE, - (const unsigned char*)aes->key, aes->rounds, counter); + #if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) + #ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) + #else + if (0) + #endif + { + AES_GCM_encrypt_NEON(in, scratch, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds, counter); + } + else + #endif + { + AES_GCM_encrypt(in, scratch, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds, counter); + } XMEMCPY(out, scratch, partial); XMEMSET(scratch, 0, WC_AES_BLOCK_SIZE); @@ -9508,8 +9761,22 @@ static int AES_GCM_encrypt_AARCH32(Aes* aes, byte* out, const byte* in, } /* Auth tag calculation. 
*/ - AES_ECB_encrypt(initialCounter, scratch, WC_AES_BLOCK_SIZE, - (const unsigned char*)aes->key, aes->rounds); +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) +#ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) +#else + if (0) +#endif + { + AES_ECB_encrypt_NEON(initialCounter, scratch, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); + } + else +#endif + { + AES_ECB_encrypt(initialCounter, scratch, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); + } xorbuf(authTag, scratch, authTagSz); return 0; @@ -9608,16 +9875,39 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, VECTOR_REGISTERS_PUSH; -#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) +#if defined(WOLFSSL_ARMASM) #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO +#if !defined(__aarch64__) AES_GCM_encrypt_AARCH32(in, out, sz, iv, ivSz, authTag, authTagSz, authIn, authInSz, (byte*)aes->key, aes->gcm.H, (byte*)aes->tmp, (byte*)aes->reg, aes->rounds); ret = 0; #else - ret = AES_GCM_encrypt_AARCH32(aes, out, in, sz, iv, ivSz, authTag, - authTagSz, authIn, authInSz); -#endif + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + if (aes->use_sha3_hw_crypto) { + AES_GCM_encrypt_AARCH64_EOR3(in, out, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz, (byte*)aes->key, aes->gcm.H, + (byte*)aes->tmp, (byte*)aes->reg, aes->rounds); + } + else + #endif + { + AES_GCM_encrypt_AARCH64(in, out, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz, (byte*)aes->key, aes->gcm.H, (byte*)aes->tmp, + (byte*)aes->reg, aes->rounds); + } + ret = 0; + } + else +#endif /* !__aarch64__ */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + { + ret = AES_GCM_encrypt_ARM(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); + } +#endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { @@ -9643,25 +9933,6 @@ int 
wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, } } else -#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { - #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - if (aes->use_sha3_hw_crypto) { - AES_GCM_encrypt_AARCH64_EOR3(in, out, sz, iv, ivSz, authTag, - authTagSz, authIn, authInSz, (byte*)aes->key, aes->gcm.H, - (byte*)aes->tmp, (byte*)aes->reg, aes->rounds); - } - else - #endif - { - AES_GCM_encrypt_AARCH64(in, out, sz, iv, ivSz, authTag, authTagSz, - authIn, authInSz, (byte*)aes->key, aes->gcm.H, (byte*)aes->tmp, - (byte*)aes->reg, aes->rounds); - } - ret = 0; - } - else #endif /* WOLFSSL_AESNI */ { ret = AES_GCM_encrypt_C(aes, out, in, sz, iv, ivSz, authTag, authTagSz, @@ -9977,7 +10248,7 @@ static WARN_UNUSED_RESULT int wc_AesGcmDecrypt_STM32( #endif /* STM32_CRYPTO_AES_GCM */ -#if !defined(WOLFSSL_ARMASM) || defined(__aarch64__) +#if !defined(WOLFSSL_ARMASM) #ifdef WOLFSSL_AESNI /* For performance reasons, this code needs to be not inlined. 
*/ int WARN_UNUSED_RESULT AES_GCM_decrypt_C( @@ -10125,8 +10396,8 @@ int WARN_UNUSED_RESULT AES_GCM_decrypt_C( #endif return ret; } -#elif defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) -static int AES_GCM_decrypt_AARCH32(Aes* aes, byte* out, const byte* in, +#elif defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +static int AES_GCM_decrypt_ARM(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -10169,8 +10440,22 @@ static int AES_GCM_decrypt_AARCH32(Aes* aes, byte* out, const byte* in, if (blocks > 0) { GCM_GMULT_LEN(&aes->gcm, x, in, blocks * WC_AES_BLOCK_SIZE); - AES_GCM_encrypt(in, out, blocks * WC_AES_BLOCK_SIZE, - (const unsigned char*)aes->key, aes->rounds, counter); + #if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) + #ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) + #else + if (sz >= 32) + #endif + { + AES_GCM_encrypt_NEON(in, out, blocks * WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds, counter); + } + else + #endif + { + AES_GCM_encrypt(in, out, blocks * WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds, counter); + } in += blocks * WC_AES_BLOCK_SIZE; out += blocks * WC_AES_BLOCK_SIZE; } @@ -10179,8 +10464,22 @@ static int AES_GCM_decrypt_AARCH32(Aes* aes, byte* out, const byte* in, XMEMCPY(scratch, in, partial); GCM_GMULT_LEN(&aes->gcm, x, scratch, WC_AES_BLOCK_SIZE); - AES_GCM_encrypt(in, scratch, WC_AES_BLOCK_SIZE, - (const unsigned char*)aes->key, aes->rounds, counter); + #if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) + #ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) + #else + if (0) + #endif + { + AES_GCM_encrypt_NEON(in, scratch, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds, counter); + } + else + #endif + { + AES_GCM_encrypt(in, scratch, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds, counter); + } XMEMCPY(out, scratch, partial); } @@ 
-10188,8 +10487,22 @@ static int AES_GCM_decrypt_AARCH32(Aes* aes, byte* out, const byte* in, FlattenSzInBits(&scratch[0], authInSz); FlattenSzInBits(&scratch[8], sz); GCM_GMULT_LEN(&aes->gcm, x, scratch, WC_AES_BLOCK_SIZE); - AES_ECB_encrypt(initialCounter, scratch, WC_AES_BLOCK_SIZE, - (const unsigned char*)aes->key, aes->rounds); +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) +#ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) +#else + if (0) +#endif + { + AES_ECB_encrypt_NEON(initialCounter, scratch, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); + } + else +#endif + { + AES_ECB_encrypt(initialCounter, scratch, WC_AES_BLOCK_SIZE, + (const unsigned char*)aes->key, aes->rounds); + } xorbuf(x, scratch, authTagSz); if (authTag != NULL) { if (ConstantCompare(authTag, x, authTagSz) != 0) { @@ -10293,15 +10606,37 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, VECTOR_REGISTERS_PUSH; -#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) +#if defined(WOLFSSL_ARMASM) #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO +#ifndef __aarch64__ ret = AES_GCM_decrypt_AARCH32(in, out, sz, iv, ivSz, authTag, authTagSz, authIn, authInSz, (byte*)aes->key, aes->gcm.H, (byte*)aes->tmp, (byte*)aes->reg, aes->rounds); #else - ret = AES_GCM_decrypt_AARCH32(aes, out, in, sz, iv, ivSz, authTag, - authTagSz, authIn, authInSz); -#endif + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + if (aes->use_sha3_hw_crypto) { + ret = AES_GCM_decrypt_AARCH64_EOR3(in, out, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz, (byte*)aes->key, aes->gcm.H, + (byte*)aes->tmp, (byte*)aes->reg, aes->rounds); + } + else + #endif + { + ret = AES_GCM_decrypt_AARCH64(in, out, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz, (byte*)aes->key, aes->gcm.H, + (byte*)aes->tmp, (byte*)aes->reg, aes->rounds); + } + } + else +#endif /* !__aarch64__ */ +#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#if defined(__aarch64__) || 
defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + { + ret = AES_GCM_decrypt_ARM(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); + } +#endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { @@ -10337,24 +10672,6 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, } } else -#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { - #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - if (aes->use_sha3_hw_crypto) { - ret = AES_GCM_decrypt_AARCH64_EOR3(in, out, sz, iv, ivSz, authTag, - authTagSz, authIn, authInSz, (byte*)aes->key, aes->gcm.H, - (byte*)aes->tmp, (byte*)aes->reg, aes->rounds); - } - else - #endif - { - ret = AES_GCM_decrypt_AARCH64(in, out, sz, iv, ivSz, authTag, - authTagSz, authIn, authInSz, (byte*)aes->key, aes->gcm.H, - (byte*)aes->tmp, (byte*)aes->reg, aes->rounds); - } - } - else #endif /* WOLFSSL_AESNI */ { ret = AES_GCM_decrypt_C(aes, out, in, sz, iv, ivSz, authTag, authTagSz, @@ -13388,19 +13705,36 @@ static WARN_UNUSED_RESULT int _AesEcbEncrypt( #else AES_ECB_encrypt(in, out, sz, (const unsigned char*)aes->key, aes->rounds); #endif +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) +#if !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_encrypt_blocks_AARCH64(in, out, sz, (byte*)aes->key, + (int)aes->rounds); + } + else +#endif +#if !defined(WOLFSSL_ARMASM_NO_NEON) +#ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) +#else + if (sz >= 32) +#endif + { + AES_ECB_encrypt_NEON(in, out, sz, (const unsigned char*)aes->key, + aes->rounds); + } + else +#endif + { + AES_ECB_encrypt(in, out, sz, (const unsigned char*)aes->key, + aes->rounds); + } #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { AES_ECB_encrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds); } else -#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - 
!defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto) { - AES_encrypt_blocks_AARCH64(in, out, sz, (byte*)aes->key, - (int)aes->rounds); - } - else #endif { #if defined(NEED_AES_TABLES) @@ -13455,19 +13789,36 @@ static WARN_UNUSED_RESULT int _AesEcbDecrypt( #else AES_ECB_decrypt(in, out, sz, (const unsigned char*)aes->key, aes->rounds); #endif +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) +#if !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_decrypt_blocks_AARCH64(in, out, sz, (byte*)aes->key, + (int)aes->rounds); + } + else +#endif +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) +#ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) +#else + if (sz >= 64) +#endif + { + AES_ECB_decrypt_NEON(in, out, sz, (const unsigned char*)aes->key, + aes->rounds); + } + else +#endif + { + AES_ECB_decrypt(in, out, sz, (const unsigned char*)aes->key, + aes->rounds); + } #else #ifdef WOLFSSL_AESNI if (aes->use_aesni) { AES_ECB_decrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds); } else -#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto) { - AES_decrypt_blocks_AARCH64(in, out, sz, (byte*)aes->key, - (int)aes->rounds); - } - else #endif { #if defined(NEED_AES_TABLES) @@ -14898,14 +15249,34 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else -#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) +#if !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) if (aes->use_aes_hw_crypto) { AES_XTS_encrypt_AARCH64(in, out, sz, i, (byte*)xaes->aes.key, (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds); ret = 0; } else +#endif +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) +#ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) +#else + if (sz >= 32) +#endif + { + 
AES_XTS_encrypt_NEON(in, out, sz, i, (byte*)xaes->aes.key, + (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds); + ret = 0; + } + else +#endif + if (1) { + AES_XTS_encrypt(in, out, sz, i, (byte*)xaes->aes.key, + (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds); + ret = 0; + } + else #endif { ret = AesXtsEncrypt_sw(xaes, out, in, sz, i); @@ -15352,14 +15723,34 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else -#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) +#if !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) if (aes->use_aes_hw_crypto) { AES_XTS_decrypt_AARCH64(in, out, sz, i, (byte*)xaes->aes.key, (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds); ret = 0; } else +#endif +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) +#ifdef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP + if (1) +#else + if (sz >= 64) +#endif + { + AES_XTS_decrypt_NEON(in, out, sz, i, (byte*)xaes->aes.key, + (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds); + ret = 0; + } + else +#endif + if (1) { + AES_XTS_decrypt(in, out, sz, i, (byte*)xaes->aes.key, + (byte*)xaes->tweak.key, (byte*)xaes->aes.tmp, xaes->aes.rounds); + ret = 0; + } + else #endif { ret = AesXtsDecrypt_sw(xaes, out, in, sz, i); diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm.S b/wolfcrypt/src/port/arm/armv8-aes-asm.S index 1ac5b953b..431dbb1dd 100644 --- a/wolfcrypt/src/port/arm/armv8-aes-asm.S +++ b/wolfcrypt/src/port/arm/armv8-aes-asm.S @@ -43117,6 +43117,14152 @@ L_aes_xts_decrypt_arm64_crypto_done: #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_XTS */ #endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#ifndef WOLFSSL_ARMASM_NO_NEON +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \ + defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +#ifndef 
__APPLE__ + .text + .type L_AES_ARM64_NEON_te, %object + .section .rodata + .size L_AES_ARM64_NEON_te, 256 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 1 +#else + .p2align 1 +#endif /* __APPLE__ */ +L_AES_ARM64_NEON_te: + .byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5 + .byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76 + .byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0 + .byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0 + .byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc + .byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15 + .byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a + .byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75 + .byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0 + .byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84 + .byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b + .byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf + .byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85 + .byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8 + .byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5 + .byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2 + .byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17 + .byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73 + .byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88 + .byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb + .byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c + .byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79 + .byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9 + .byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08 + .byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6 + .byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a + .byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e + .byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e + .byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94 + .byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf + .byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68 + .byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16 +#ifndef __APPLE__ + .text + .type L_AES_ARM64_NEON_shift_rows_shuffle, %object + .section .rodata + .size L_AES_ARM64_NEON_shift_rows_shuffle, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ 
+#ifndef __APPLE__ + .align 1 +#else + .p2align 1 +#endif /* __APPLE__ */ +L_AES_ARM64_NEON_shift_rows_shuffle: + .byte 0x0c,0x09,0x06,0x03,0x00,0x0d,0x0a,0x07 + .byte 0x04,0x01,0x0e,0x0b,0x08,0x05,0x02,0x0f +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || + * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +#ifndef __APPLE__ +.text +.globl AES_invert_key_NEON +.type AES_invert_key_NEON,@function +.align 2 +AES_invert_key_NEON: +#else +.section __TEXT,__text +.globl _AES_invert_key_NEON +.p2align 2 +_AES_invert_key_NEON: +#endif /* __APPLE__ */ + add x3, x0, x1, lsl 4 + mov x2, x0 + mov w4, w1 +L_AES_invert_key_NEON_loop: + ld1 {v0.2d}, [x2] + ld1 {v1.2d}, [x3] + st1 {v0.2d}, [x3] + st1 {v1.2d}, [x2], #16 + subs w4, w4, #2 + sub x3, x3, #16 + bne L_AES_invert_key_NEON_loop + movi v2.16b, #27 + add x2, x0, #16 + sub w4, w1, #1 +L_AES_invert_key_NEON_mix_loop: + ld1 {v0.2d}, [x2] + sshr v5.16b, v0.16b, #7 + ushr v6.16b, v0.16b, #6 + ushr v3.16b, v0.16b, #5 + and v5.16b, v5.16b, v2.16b + pmul v6.16b, v6.16b, v2.16b + pmul v3.16b, v3.16b, v2.16b + shl v4.16b, v0.16b, #1 + eor v5.16b, v5.16b, v4.16b + shl v4.16b, v0.16b, #3 + eor v3.16b, v3.16b, v4.16b + shl v4.16b, v0.16b, #2 + eor v6.16b, v6.16b, v4.16b + eor v4.16b, v5.16b, v3.16b + eor v3.16b, v3.16b, v0.16b + eor v5.16b, v6.16b, v3.16b + eor v6.16b, v6.16b, v4.16b + eor v4.16b, v4.16b, v0.16b + shl v0.4s, v4.4s, #8 + rev32 v5.8h, v5.8h + sri v0.4s, v4.4s, #24 + eor v0.16b, v0.16b, v6.16b + shl v4.4s, v3.4s, #24 + eor v0.16b, v0.16b, v5.16b + sri v4.4s, v3.4s, #8 + eor v0.16b, v0.16b, v4.16b + st1 {v0.2d}, [x2], #16 + subs w4, w4, #1 + bne L_AES_invert_key_NEON_mix_loop + ret +#ifndef __APPLE__ + .size AES_invert_key_NEON,.-AES_invert_key_NEON +#endif /* __APPLE__ */ +#endif /* HAVE_AES_DECRYPT */ +#ifndef __APPLE__ + .text + .type L_AES_ARM64_NEON_rcon, %object + .section .rodata + .size L_AES_ARM64_NEON_rcon, 40 +#else + .section __DATA,__data +#endif /* __APPLE__ 
*/ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_AES_ARM64_NEON_rcon: + .word 0x01000000 + .word 0x02000000 + .word 0x04000000 + .word 0x08000000 + .word 0x10000000 + .word 0x20000000 + .word 0x40000000 + .word 0x80000000 + .word 0x1b000000 + .word 0x36000000 +#ifndef __APPLE__ +.text +.globl AES_set_encrypt_key_NEON +.type AES_set_encrypt_key_NEON,@function +.align 2 +AES_set_encrypt_key_NEON: +#else +.section __TEXT,__text +.globl _AES_set_encrypt_key_NEON +.p2align 2 +_AES_set_encrypt_key_NEON: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x4, L_AES_ARM64_NEON_rcon + add x4, x4, :lo12:L_AES_ARM64_NEON_rcon +#else + adrp x4, L_AES_ARM64_NEON_rcon@PAGE + add x4, x4, :lo12:L_AES_ARM64_NEON_rcon@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x5, L_AES_ARM64_NEON_te + add x5, x5, :lo12:L_AES_ARM64_NEON_te +#else + adrp x5, L_AES_ARM64_NEON_te@PAGE + add x5, x5, :lo12:L_AES_ARM64_NEON_te@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v6.16b, v7.16b, v8.16b, v9.16b}, [x5], #0x40 + ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [x5], #0x40 + ld1 {v14.16b, v15.16b, v16.16b, v17.16b}, [x5], #0x40 + ld1 {v18.16b, v19.16b, v20.16b, v21.16b}, [x5] + movi v2.16b, #0x40 + movi v3.16b, #0x80 + movi v4.16b, #0xc0 + movi v5.16b, #27 + eor v26.16b, v26.16b, v26.16b + cmp x1, #0x80 + beq L_AES_set_encrypt_key_NEON_start_128 + cmp x1, #0xc0 + beq L_AES_set_encrypt_key_NEON_start_192 + ld1 {v0.16b}, [x0], #16 + ld1 {v1.16b}, [x0] + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + st1 {v0.2d}, [x2], #16 + st1 {v1.2d}, [x2], #16 + mov x3, #6 +L_AES_set_encrypt_key_NEON_loop_256: + eor v22.16b, v1.16b, v2.16b + eor v23.16b, v1.16b, v3.16b + eor v24.16b, v1.16b, v4.16b + tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b + tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b + tbl v23.16b, {v14.16b, 
v15.16b, v16.16b, v17.16b}, v23.16b + tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b + orr v25.16b, v25.16b, v22.16b + orr v23.16b, v23.16b, v24.16b + orr v25.16b, v25.16b, v23.16b + ext v25.16b, v25.16b, v26.16b, #12 + shl v22.4s, v25.4s, #8 + sri v22.4s, v25.4s, #24 + eor v0.16b, v0.16b, v22.16b + ld1r {v25.4s}, [x4], #4 + dup v22.4s, v0.s[0] + dup v23.2s, v0.s[1] + dup v24.2s, v0.s[2] + ext v22.16b, v26.16b, v22.16b, #12 + ext v23.16b, v26.16b, v23.16b, #8 + eor v0.16b, v0.16b, v22.16b + ext v24.16b, v26.16b, v24.16b, #4 + eor v0.16b, v0.16b, v23.16b + eor v0.16b, v0.16b, v24.16b + eor v0.16b, v0.16b, v25.16b + st1 {v0.2d}, [x2], #16 + eor v22.16b, v0.16b, v2.16b + eor v23.16b, v0.16b, v3.16b + eor v24.16b, v0.16b, v4.16b + tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v0.16b + tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b + tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b + tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b + orr v25.16b, v25.16b, v22.16b + orr v23.16b, v23.16b, v24.16b + orr v25.16b, v25.16b, v23.16b + ext v25.16b, v25.16b, v26.16b, #12 + eor v1.16b, v1.16b, v25.16b + dup v22.4s, v1.s[0] + dup v23.2s, v1.s[1] + dup v24.2s, v1.s[2] + ext v22.16b, v26.16b, v22.16b, #12 + ext v23.16b, v26.16b, v23.16b, #8 + eor v1.16b, v1.16b, v22.16b + ext v24.16b, v26.16b, v24.16b, #4 + eor v1.16b, v1.16b, v23.16b + eor v1.16b, v1.16b, v24.16b + st1 {v1.2d}, [x2], #16 + subs x3, x3, #1 + bne L_AES_set_encrypt_key_NEON_loop_256 + eor v22.16b, v1.16b, v2.16b + eor v23.16b, v1.16b, v3.16b + eor v24.16b, v1.16b, v4.16b + tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b + tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b + tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b + tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b + orr v25.16b, v25.16b, v22.16b + orr v23.16b, v23.16b, v24.16b + orr v25.16b, v25.16b, v23.16b + ext v25.16b, v25.16b, v26.16b, #12 + shl v22.4s, v25.4s, #8 + sri v22.4s, 
v25.4s, #24 + eor v0.16b, v0.16b, v22.16b + ld1r {v25.4s}, [x4], #4 + dup v22.4s, v0.s[0] + dup v23.2s, v0.s[1] + dup v24.2s, v0.s[2] + ext v22.16b, v26.16b, v22.16b, #12 + ext v23.16b, v26.16b, v23.16b, #8 + eor v0.16b, v0.16b, v22.16b + ext v24.16b, v26.16b, v24.16b, #4 + eor v0.16b, v0.16b, v23.16b + eor v0.16b, v0.16b, v24.16b + eor v0.16b, v0.16b, v25.16b + st1 {v0.2d}, [x2], #16 + b L_AES_set_encrypt_key_NEON_end +L_AES_set_encrypt_key_NEON_start_192: + ld1 {v0.16b}, [x0], #16 + ld1 {v1.8b}, [x0] + rev32 v0.16b, v0.16b + rev32 v1.8b, v1.8b + st1 {v0.16b}, [x2], #16 + st1 {v1.8b}, [x2], #8 + ext v1.16b, v1.16b, v1.16b, #8 + mov x3, #7 +L_AES_set_encrypt_key_NEON_loop_192: + eor v22.16b, v1.16b, v2.16b + eor v23.16b, v1.16b, v3.16b + eor v24.16b, v1.16b, v4.16b + tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b + tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b + tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b + tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b + orr v25.16b, v25.16b, v22.16b + orr v23.16b, v23.16b, v24.16b + orr v25.16b, v25.16b, v23.16b + ext v25.16b, v25.16b, v26.16b, #12 + shl v22.4s, v25.4s, #8 + sri v22.4s, v25.4s, #24 + eor v0.16b, v0.16b, v22.16b + ld1r {v25.4s}, [x4], #4 + dup v22.4s, v0.s[0] + dup v23.2s, v0.s[1] + dup v24.2s, v0.s[2] + ext v22.16b, v26.16b, v22.16b, #12 + ext v23.16b, v26.16b, v23.16b, #8 + eor v0.16b, v0.16b, v22.16b + ext v24.16b, v26.16b, v24.16b, #4 + eor v0.16b, v0.16b, v23.16b + eor v0.16b, v0.16b, v24.16b + eor v0.16b, v0.16b, v25.16b + st1 {v0.2d}, [x2], #16 + mov v23.16b, v26.16b + mov v23.s[2], v0.s[3] + eor v1.16b, v1.16b, v23.16b + mov v23.16b, v26.16b + mov v23.s[3], v1.s[2] + eor v1.16b, v1.16b, v23.16b + st1 {v1.d}[1], [x2], #8 + subs x3, x3, #1 + bne L_AES_set_encrypt_key_NEON_loop_192 + eor v22.16b, v1.16b, v2.16b + eor v23.16b, v1.16b, v3.16b + eor v24.16b, v1.16b, v4.16b + tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b + tbl v22.16b, {v10.16b, v11.16b, 
v12.16b, v13.16b}, v22.16b + tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b + tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b + orr v25.16b, v25.16b, v22.16b + orr v23.16b, v23.16b, v24.16b + orr v25.16b, v25.16b, v23.16b + ext v25.16b, v25.16b, v26.16b, #12 + shl v22.4s, v25.4s, #8 + sri v22.4s, v25.4s, #24 + eor v0.16b, v0.16b, v22.16b + ld1r {v25.4s}, [x4], #4 + dup v22.4s, v0.s[0] + dup v23.2s, v0.s[1] + dup v24.2s, v0.s[2] + ext v22.16b, v26.16b, v22.16b, #12 + ext v23.16b, v26.16b, v23.16b, #8 + eor v0.16b, v0.16b, v22.16b + ext v24.16b, v26.16b, v24.16b, #4 + eor v0.16b, v0.16b, v23.16b + eor v0.16b, v0.16b, v24.16b + eor v0.16b, v0.16b, v25.16b + st1 {v0.2d}, [x2], #16 + b L_AES_set_encrypt_key_NEON_end +L_AES_set_encrypt_key_NEON_start_128: + ld1 {v0.16b}, [x0] + rev32 v0.16b, v0.16b + st1 {v0.2d}, [x2], #16 + mov x3, #10 +L_AES_set_encrypt_key_NEON_loop_128: + eor v22.16b, v0.16b, v2.16b + eor v23.16b, v0.16b, v3.16b + eor v24.16b, v0.16b, v4.16b + tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v0.16b + tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b + tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b + tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b + orr v25.16b, v25.16b, v22.16b + orr v23.16b, v23.16b, v24.16b + orr v25.16b, v25.16b, v23.16b + ext v25.16b, v25.16b, v26.16b, #12 + shl v22.4s, v25.4s, #8 + sri v22.4s, v25.4s, #24 + eor v0.16b, v0.16b, v22.16b + ld1r {v25.4s}, [x4], #4 + dup v22.4s, v0.s[0] + dup v23.2s, v0.s[1] + dup v24.2s, v0.s[2] + ext v22.16b, v26.16b, v22.16b, #12 + ext v23.16b, v26.16b, v23.16b, #8 + eor v0.16b, v0.16b, v22.16b + ext v24.16b, v26.16b, v24.16b, #4 + eor v0.16b, v0.16b, v23.16b + eor v0.16b, v0.16b, v24.16b + eor v0.16b, v0.16b, v25.16b + st1 {v0.2d}, [x2], #16 + subs x3, x3, #1 + bne L_AES_set_encrypt_key_NEON_loop_128 +L_AES_set_encrypt_key_NEON_end: + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp 
x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size AES_set_encrypt_key_NEON,.-AES_set_encrypt_key_NEON +#endif /* __APPLE__ */ +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_ECB) +#ifndef __APPLE__ +.text +.globl AES_ECB_encrypt_NEON +.type AES_ECB_encrypt_NEON,@function +.align 2 +AES_ECB_encrypt_NEON: +#else +.section __TEXT,__text +.globl _AES_ECB_encrypt_NEON +.p2align 2 +_AES_ECB_encrypt_NEON: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! /* AES_ECB_encrypt_NEON: encrypts whole 16-byte blocks with NEON table lookups, four blocks per pass while at least 64 bytes remain, then a two-block and a one-block path. As used below: x0 = input pointer, x1 = output pointer, x2 = byte count, x3 = start of the round keys, w4 = round count. Presumably called from C as (in, out, len, ks, nr) - confirm against the prototype. */ + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] /* v8-v15 are used as scratch below, so d8-d15 are saved here and restored before returning */ +#ifndef __APPLE__ + adrp x5, L_AES_ARM64_NEON_te + add x5, x5, :lo12:L_AES_ARM64_NEON_te +#else + adrp x5, L_AES_ARM64_NEON_te@PAGE + add x5, x5, :lo12:L_AES_ARM64_NEON_te@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x6, L_AES_ARM64_NEON_shift_rows_shuffle + add x6, x6, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle +#else + adrp x6, L_AES_ARM64_NEON_shift_rows_shuffle@PAGE + add x6, x6, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], #0x40 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x5], #0x40 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x5], #0x40 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x5] /* v16-v31 now hold the 256 bytes at L_AES_ARM64_NEON_te, 64 bytes per group of four registers (presumably the AES S-box - the table itself is defined elsewhere) */ + cmp x2, #0x40 + blt L_AES_ECB_encrypt_NEON_start_2 /* fewer than 64 bytes: skip the four-block loop */ +L_AES_ECB_encrypt_NEON_loop_4: + mov x8, x3 /* x8 walks the round keys and restarts at the first one for every batch */ + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld1 {v4.2d}, [x8], #16 + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b /* byte-reverse every 32-bit word of the input; undone by the matching rev32 before the store */ + # Round: 0 - XOR in key schedule + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + sub w7, w4, #2 /* the loop below performs two rounds per pass and exits when w7 reaches 0; two more rounds follow it. NOTE(review): this looks like it needs an even round count greater than 2 - confirm callers only pass 10/12/14. */ +L_AES_ECB_encrypt_NEON_loop_nr_4: + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b,
v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 /* a four-register TBL only covers indices 0-63 and returns 0 for anything larger; XORing the index with 0x40/0x80/0xc0 rebases it onto each further 64-byte quarter, and ORing the four partial results gives one 256-entry byte lookup */ + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x6] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b /* byte permutation taken from L_AES_ARM64_NEON_shift_rows_shuffle (the ShiftRows step, going by the label name) */ + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + movi v0.16b, #27 + and v8.16b, v8.16b, v0.16b + and v9.16b,
v9.16b, v0.16b + and v10.16b, v10.16b, v0.16b + and v11.16b, v11.16b, v0.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b /* v8-v11 = 2*x in GF(2^8) per byte: shift left by one and XOR in 0x1b (27) wherever the top bit was set */ + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + eor v2.16b, v10.16b, v6.16b + eor v3.16b, v11.16b, v7.16b /* v0-v3 = 3*x (2*x XOR x) */ + shl v12.4s, v0.4s, #8 + shl v13.4s, v1.4s, #8 + shl v14.4s, v2.4s, #8 + shl v15.4s, v3.4s, #8 + sri v12.4s, v0.4s, #24 + sri v13.4s, v1.4s, #24 + sri v14.4s, v2.4s, #24 + sri v15.4s, v3.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + shl v2.4s, v6.4s, #24 + shl v3.4s, v7.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + sri v2.4s, v6.4s, #8 + sri v3.4s, v7.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + rev32 v6.8h, v6.8h + rev32 v7.8h, v7.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x8], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b /* each 32-bit word is now 2*x ^ rotl8(3*x) ^ rotl16(x) ^ rotl24(x) ^ round key, i.e. MixColumns expressed as word rotations plus the key XOR */ + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b /* second unrolled round: the same steps with the roles of v0-v3 and v4-v7 swapped */ + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b,
v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x6] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + sshr v10.16b, v2.16b, #7 + sshr v11.16b, v3.16b, #7 + shl v12.16b, v0.16b, #1 + shl v13.16b, v1.16b, #1 + shl v14.16b, v2.16b, #1 + shl v15.16b, v3.16b, #1 + movi v4.16b, #27 + and v8.16b, v8.16b, v4.16b + and v9.16b, v9.16b, v4.16b + and v10.16b, v10.16b, v4.16b + and v11.16b, v11.16b, v4.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + eor v4.16b, v8.16b, v0.16b + eor v5.16b, v9.16b, v1.16b + eor v6.16b, v10.16b, v2.16b + eor v7.16b, v11.16b, v3.16b + shl v12.4s, v4.4s, #8 + shl v13.4s, v5.4s, #8 + shl v14.4s, v6.4s, #8 + shl v15.4s, v7.4s, #8 + sri v12.4s, v4.4s, #24 + sri v13.4s, v5.4s, #24 + sri v14.4s, v6.4s, #24 + sri v15.4s, v7.4s, #24 +
shl v4.4s, v0.4s, #24 + shl v5.4s, v1.4s, #24 + shl v6.4s, v2.4s, #24 + shl v7.4s, v3.4s, #24 + sri v4.4s, v0.4s, #8 + sri v5.4s, v1.4s, #8 + sri v6.4s, v2.4s, #8 + sri v7.4s, v3.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + rev32 v2.8h, v2.8h + rev32 v3.8h, v3.8h + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x8], #16 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + # Round Done + subs w7, w7, #2 + bne L_AES_ECB_encrypt_NEON_loop_nr_4 /* two rounds remain after the loop: one more full round, then the last round without the column mixing */ + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b},
v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x6] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + movi v0.16b, #27 + and v8.16b, v8.16b, v0.16b + and v9.16b, v9.16b, v0.16b + and v10.16b, v10.16b, v0.16b + and v11.16b, v11.16b, v0.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + eor v2.16b, v10.16b, v6.16b + eor v3.16b, v11.16b, v7.16b + shl v12.4s, v0.4s, #8 + shl v13.4s, v1.4s, #8 + shl v14.4s, v2.4s, #8 + shl v15.4s, v3.4s, #8 + sri v12.4s, v0.4s, #24 + sri v13.4s, v1.4s, #24 + sri v14.4s, v2.4s, #24 + sri v15.4s, v3.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + shl v2.4s, v6.4s, #24 + shl v3.4s, v7.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + sri v2.4s, v6.4s, #8 + sri v3.4s, v7.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + rev32 v6.8h, v6.8h + rev32 v7.8h, v7.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x8], #16 + eor v4.16b, v4.16b,
v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x6] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b /* last round: table lookup and byte shuffle only, followed directly by the final round key */ + # XOR in Key Schedule + ld1 {v4.2d}, [x8], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #0x40 + sub x2, x2, #0x40 + cmp x2, #0x40 + bge L_AES_ECB_encrypt_NEON_loop_4 +L_AES_ECB_encrypt_NEON_start_2: + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + movi v15.16b, #27 /* constants kept in registers for the two-block and one-block paths: the three table-quarter offsets and the reduction byte 0x1b */ + cmp x2, #16 + beq L_AES_ECB_encrypt_NEON_start_1 + blt L_AES_ECB_encrypt_NEON_data_done /* both branches use the same compare: exactly 16 bytes left goes to the one-block path, fewer than 16 finishes */ +L_AES_ECB_encrypt_NEON_loop_2: + mov x8, x3 /* two-block path; no backward branch targets this label, so it runs at most once per call */ + ld1 {v0.16b, v1.16b}, [x0], #32 + ld1 {v4.2d}, [x8], #16 + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + # Round: 0 - XOR in key schedule + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + sub w7, w4, #2 +L_AES_ECB_encrypt_NEON_loop_nr_2: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x6] + tbl
v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v10.16b, v4.16b, #1 + shl v11.16b, v5.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, v9.16b, v11.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + shl v10.4s, v0.4s, #8 + shl v11.4s, v1.4s, #8 + sri v10.4s, v0.4s, #24 + sri v11.4s, v1.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x8], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # Round Done + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x6] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + shl v10.16b, v0.16b, #1 + shl v11.16b, v1.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b,
v9.16b, v11.16b + eor v4.16b, v8.16b, v0.16b + eor v5.16b, v9.16b, v1.16b + shl v10.4s, v4.4s, #8 + shl v11.4s, v5.4s, #8 + sri v10.4s, v4.4s, #24 + sri v11.4s, v5.4s, #24 + shl v4.4s, v0.4s, #24 + shl v5.4s, v1.4s, #24 + sri v4.4s, v0.4s, #8 + sri v5.4s, v1.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x8], #16 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v0.16b, v0.16b, v10.16b + eor v1.16b, v1.16b, v11.16b + # Round Done + subs w7, w7, #2 + bne L_AES_ECB_encrypt_NEON_loop_nr_2 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x6] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v10.16b, v4.16b, #1 + shl v11.16b, v5.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, v9.16b, v11.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + shl v10.4s, v0.4s, #8 + shl v11.4s, v1.4s, #8 + sri v10.4s, v0.4s, #24 + sri v11.4s, v1.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s,
v5.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x8], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # Round Done + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x6] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x8], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + st1 {v0.16b, v1.16b}, [x1], #32 + sub x2, x2, #32 + cmp x2, #0 + beq L_AES_ECB_encrypt_NEON_data_done /* anything still left after these 32 bytes falls through and is handled as one final 16-byte block, with no further length check */ +L_AES_ECB_encrypt_NEON_start_1: + ld1 {v3.2d}, [x6] /* the byte-shuffle mask stays in v3 for the whole single-block path */ + mov x8, x3 + ld1 {v0.16b}, [x0], #16 + ld1 {v4.2d}, [x8], #16 + rev32 v0.16b, v0.16b + # Round: 0 - XOR in key schedule + eor v0.16b, v0.16b, v4.16b + sub w7, w4, #2 +L_AES_ECB_encrypt_NEON_loop_nr_1: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl
v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v0.2d}, [x8], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v0.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x8], #16 + sshr v10.16b, v0.16b, #7 + shl v9.16b, v0.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v0.8h + eor v11.16b, v10.16b, v0.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v0.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v4.16b + sri v9.4s, v0.4s, #8 + sri v8.4s, v11.4s, #24 + eor v0.16b, v10.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + subs w7, w7, #2 + bne L_AES_ECB_encrypt_NEON_loop_nr_1 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr
v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v0.2d}, [x8], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v0.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x8], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + rev32 v0.16b, v0.16b + st1 {v0.16b}, [x1], #16 +L_AES_ECB_encrypt_NEON_data_done: + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size AES_ECB_encrypt_NEON,.-AES_ECB_encrypt_NEON +#endif /* __APPLE__ */ +#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || + * WOLFSSL_AES_COUNTER || HAVE_AES_ECB */ +#ifdef HAVE_AES_CBC +#ifndef __APPLE__ +.text +.globl AES_CBC_encrypt_NEON +.type AES_CBC_encrypt_NEON,@function +.align 2 +AES_CBC_encrypt_NEON: +#else +.section __TEXT,__text +.globl _AES_CBC_encrypt_NEON +.p2align 2 +_AES_CBC_encrypt_NEON: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! /* AES_CBC_encrypt_NEON: CBC-mode encryption with NEON table lookups, strictly one 16-byte block at a time because each block depends on the previous output. As used below: x0 = input pointer, x1 = output pointer, x2 = byte count, x3 = start of the round keys, w4 = round count, x5 = 16-byte chaining value that is read on entry and overwritten on exit. Presumably called from C as (in, out, len, ks, nr, iv) - confirm against the prototype. */
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] /* v8-v15 are written below, so d8-d15 are saved here and restored before returning */ +#ifndef __APPLE__ + adrp x6, L_AES_ARM64_NEON_te + add x6, x6, :lo12:L_AES_ARM64_NEON_te +#else + adrp x6, L_AES_ARM64_NEON_te@PAGE + add x6, x6, :lo12:L_AES_ARM64_NEON_te@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x7, L_AES_ARM64_NEON_shift_rows_shuffle + add x7, x7, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle +#else + adrp x7, L_AES_ARM64_NEON_shift_rows_shuffle@PAGE + add x7, x7, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [x6], #0x40 + ld1 {v14.16b, v15.16b, v16.16b, v17.16b}, [x6], #0x40 + ld1 {v18.16b, v19.16b, v20.16b, v21.16b}, [x6], #0x40 + ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [x6] /* v10-v25 now hold the 256 bytes at L_AES_ARM64_NEON_te, 64 bytes per group of four registers (presumably the AES S-box - the table itself is defined elsewhere) */ + movi v6.16b, #0x40 + movi v7.16b, #0x80 + movi v8.16b, #0xc0 + movi v9.16b, #27 /* v6-v8 = index offsets selecting the second, third and fourth 64-byte table quarter; v9 = 0x1b, the reduction byte used when doubling in GF(2^8) */ + ld1 {v0.2d}, [x5] /* v0 = chaining value from [x5] */ + ld1 {v26.2d}, [x7] /* v26 = byte-shuffle mask from L_AES_ARM64_NEON_shift_rows_shuffle (the ShiftRows step, going by the label name) */ +L_AES_CBC_encrypt_NEON_loop_block: + add x9, x3, #16 /* x9 = address of the second round key; the first one is read directly from [x3] */ + ld1 {v1.16b}, [x0], #16 + ld1 {v2.16b}, [x3] + eor v0.16b, v0.16b, v1.16b /* XOR the next input block into the chaining value */ + rev32 v0.16b, v0.16b + # Round: 0 - XOR in key schedule + eor v0.16b, v0.16b, v2.16b + sub w8, w4, #2 /* the loop below performs two rounds per pass and exits when w8 reaches 0; two more rounds follow it. NOTE(review): this looks like it needs an even round count greater than 2 - confirm callers only pass 10/12/14. */ +L_AES_CBC_encrypt_NEON_loop_nr: + eor v2.16b, v0.16b, v6.16b + eor v3.16b, v0.16b, v7.16b + eor v4.16b, v0.16b, v8.16b + tbl v1.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v0.16b + tbl v2.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v2.16b + tbl v3.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v3.16b + tbl v4.16b, {v22.16b, v23.16b, v24.16b, v25.16b}, v4.16b + orr v1.16b, v1.16b, v2.16b + orr v3.16b, v3.16b, v4.16b + orr v1.16b, v1.16b, v3.16b /* a four-register TBL returns 0 for any index above 63, so only one of the four rebased lookups is non-zero per byte and ORing them gives the full 256-entry lookup */ + tbl v1.16b, {v1.16b}, v26.16b + ld1 {v0.2d}, [x9], #16 + sshr v4.16b, v1.16b, #7 + shl v3.16b, v1.16b, #1 + and v4.16b, v4.16b, v9.16b + eor v4.16b, v4.16b, v3.16b /* v4 = 2*x in GF(2^8) per byte: shift left by one and XOR in 0x1b wherever the top bit was set */ + rev32 v2.8h, v1.8h + eor v5.16b, v4.16b, v1.16b + eor v4.16b, v4.16b, v2.16b + shl v3.4s, v1.4s, #24 + shl v2.4s, v5.4s, #8 + # XOR in Key Schedule + eor v4.16b, v4.16b, v0.16b + sri v3.4s, v1.4s, #8 + sri v2.4s,
v5.4s, #24 + eor v1.16b, v4.16b, v3.16b + eor v1.16b, v1.16b, v2.16b /* each 32-bit word is now 2*x ^ rotl8(3*x) ^ rotl16(x) ^ rotl24(x) ^ round key, i.e. MixColumns expressed as word rotations plus the key XOR */ + eor v2.16b, v1.16b, v6.16b + eor v3.16b, v1.16b, v7.16b + eor v4.16b, v1.16b, v8.16b + tbl v0.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v1.16b + tbl v2.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v2.16b + tbl v3.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v3.16b + tbl v4.16b, {v22.16b, v23.16b, v24.16b, v25.16b}, v4.16b + orr v0.16b, v0.16b, v2.16b + orr v3.16b, v3.16b, v4.16b + orr v0.16b, v0.16b, v3.16b + tbl v0.16b, {v0.16b}, v26.16b + ld1 {v1.2d}, [x9], #16 + sshr v4.16b, v0.16b, #7 + shl v3.16b, v0.16b, #1 + and v4.16b, v4.16b, v9.16b + eor v4.16b, v4.16b, v3.16b + rev32 v2.8h, v0.8h + eor v5.16b, v4.16b, v0.16b + eor v4.16b, v4.16b, v2.16b + shl v3.4s, v0.4s, #24 + shl v2.4s, v5.4s, #8 + # XOR in Key Schedule + eor v4.16b, v4.16b, v1.16b + sri v3.4s, v0.4s, #8 + sri v2.4s, v5.4s, #24 + eor v0.16b, v4.16b, v3.16b + eor v0.16b, v0.16b, v2.16b + subs w8, w8, #2 + bne L_AES_CBC_encrypt_NEON_loop_nr /* two rounds remain after the loop: one more full round, then the last round without the column mixing */ + eor v2.16b, v0.16b, v6.16b + eor v3.16b, v0.16b, v7.16b + eor v4.16b, v0.16b, v8.16b + tbl v1.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v0.16b + tbl v2.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v2.16b + tbl v3.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v3.16b + tbl v4.16b, {v22.16b, v23.16b, v24.16b, v25.16b}, v4.16b + orr v1.16b, v1.16b, v2.16b + orr v3.16b, v3.16b, v4.16b + orr v1.16b, v1.16b, v3.16b + tbl v1.16b, {v1.16b}, v26.16b + ld1 {v0.2d}, [x9], #16 + sshr v4.16b, v1.16b, #7 + shl v3.16b, v1.16b, #1 + and v4.16b, v4.16b, v9.16b + eor v4.16b, v4.16b, v3.16b + rev32 v2.8h, v1.8h + eor v5.16b, v4.16b, v1.16b + eor v4.16b, v4.16b, v2.16b + shl v3.4s, v1.4s, #24 + shl v2.4s, v5.4s, #8 + # XOR in Key Schedule + eor v4.16b, v4.16b, v0.16b + sri v3.4s, v1.4s, #8 + sri v2.4s, v5.4s, #24 + eor v1.16b, v4.16b, v3.16b + eor v1.16b, v1.16b, v2.16b + eor v2.16b, v1.16b, v6.16b + eor v3.16b, v1.16b, v7.16b + eor v4.16b, v1.16b, v8.16b + tbl v0.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v1.16b + tbl
v2.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v2.16b + tbl v3.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v3.16b + tbl v4.16b, {v22.16b, v23.16b, v24.16b, v25.16b}, v4.16b + orr v0.16b, v0.16b, v2.16b + orr v3.16b, v3.16b, v4.16b + orr v0.16b, v0.16b, v3.16b + tbl v0.16b, {v0.16b}, v26.16b /* last round: table lookup and byte shuffle only, followed directly by the final round key */ + ld1 {v1.2d}, [x9], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v1.16b + rev32 v0.16b, v0.16b + st1 {v0.16b}, [x1], #16 /* v0 is left untouched after the store, so this output block is the chaining value for the next pass */ + subs x2, x2, #16 + bne L_AES_CBC_encrypt_NEON_loop_block /* NOTE(review): the count is only tested after a block has been processed, so x2 is expected to be a non-zero multiple of 16 on entry - confirm the C caller guarantees this */ + st1 {v0.2d}, [x5] /* write the last output block back to [x5] */ + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size AES_CBC_encrypt_NEON,.-AES_CBC_encrypt_NEON +#endif /* __APPLE__ */ +#endif /* HAVE_AES_CBC */ +#ifdef WOLFSSL_AES_COUNTER +#ifndef __APPLE__ +.text +.globl AES_CTR_encrypt_NEON +.type AES_CTR_encrypt_NEON,@function +.align 2 +AES_CTR_encrypt_NEON: +#else +.section __TEXT,__text +.globl _AES_CTR_encrypt_NEON +.p2align 2 +_AES_CTR_encrypt_NEON: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! 
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x6, L_AES_ARM64_NEON_te + add x6, x6, :lo12:L_AES_ARM64_NEON_te +#else + adrp x6, L_AES_ARM64_NEON_te@PAGE + add x6, x6, :lo12:L_AES_ARM64_NEON_te@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x7, L_AES_ARM64_NEON_shift_rows_shuffle + add x7, x7, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle +#else + adrp x7, L_AES_ARM64_NEON_shift_rows_shuffle@PAGE + add x7, x7, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x6], #0x40 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x6], #0x40 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x6], #0x40 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x6] + ld1 {v2.2d}, [x5] + rev64 v8.16b, v2.16b + rev32 v2.16b, v2.16b + mov x10, v8.d[1] + mov x11, v8.d[0] + cmp x2, #0x40 + blt L_AES_CTR_encrypt_NEON_start_2 +L_AES_CTR_encrypt_NEON_loop_4: + mov x9, x3 + ld1 {v4.2d}, [x9], #16 + mov v8.d[1], x10 + mov v8.d[0], x11 + rev64 v8.16b, v8.16b + rev32 v8.16b, v8.16b + # Round: 0 - XOR in key schedule + eor v0.16b, v8.16b, v4.16b + adds x10, x10, #1 + adc x11, x11, xzr + mov v8.d[1], x10 + mov v8.d[0], x11 + rev64 v8.16b, v8.16b + rev32 v8.16b, v8.16b + eor v1.16b, v8.16b, v4.16b + adds x10, x10, #1 + adc x11, x11, xzr + mov v8.d[1], x10 + mov v8.d[0], x11 + rev64 v8.16b, v8.16b + rev32 v8.16b, v8.16b + eor v2.16b, v8.16b, v4.16b + adds x10, x10, #1 + adc x11, x11, xzr + mov v8.d[1], x10 + mov v8.d[0], x11 + rev64 v8.16b, v8.16b + rev32 v8.16b, v8.16b + eor v3.16b, v8.16b, v4.16b + adds x10, x10, #1 + adc x11, x11, xzr + mov v8.d[1], x10 + mov v8.d[0], x11 + rev64 v8.16b, v8.16b + rev32 v8.16b, v8.16b + sub w8, w4, #2 +L_AES_CTR_encrypt_NEON_loop_nr_4: + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, 
v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x7] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + movi v0.16b, #27 + and v8.16b, v8.16b, v0.16b + and v9.16b, v9.16b, 
v0.16b + and v10.16b, v10.16b, v0.16b + and v11.16b, v11.16b, v0.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + eor v2.16b, v10.16b, v6.16b + eor v3.16b, v11.16b, v7.16b + shl v12.4s, v0.4s, #8 + shl v13.4s, v1.4s, #8 + shl v14.4s, v2.4s, #8 + shl v15.4s, v3.4s, #8 + sri v12.4s, v0.4s, #24 + sri v13.4s, v1.4s, #24 + sri v14.4s, v2.4s, #24 + sri v15.4s, v3.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + shl v2.4s, v6.4s, #24 + shl v3.4s, v7.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + sri v2.4s, v6.4s, #8 + sri v3.4s, v7.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + rev32 v6.8h, v6.8h + rev32 v7.8h, v7.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, 
v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x7] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + sshr v10.16b, v2.16b, #7 + sshr v11.16b, v3.16b, #7 + shl v12.16b, v0.16b, #1 + shl v13.16b, v1.16b, #1 + shl v14.16b, v2.16b, #1 + shl v15.16b, v3.16b, #1 + movi v4.16b, #27 + and v8.16b, v8.16b, v4.16b + and v9.16b, v9.16b, v4.16b + and v10.16b, v10.16b, v4.16b + and v11.16b, v11.16b, v4.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + eor v4.16b, v8.16b, v0.16b + eor v5.16b, v9.16b, v1.16b + eor v6.16b, v10.16b, v2.16b + eor v7.16b, v11.16b, v3.16b + shl v12.4s, v4.4s, #8 + shl v13.4s, v5.4s, #8 + shl v14.4s, v6.4s, #8 + shl v15.4s, v7.4s, #8 + sri v12.4s, v4.4s, #24 + sri v13.4s, v5.4s, #24 + sri v14.4s, v6.4s, #24 + sri v15.4s, v7.4s, #24 + shl 
v4.4s, v0.4s, #24 + shl v5.4s, v1.4s, #24 + shl v6.4s, v2.4s, #24 + shl v7.4s, v3.4s, #24 + sri v4.4s, v0.4s, #8 + sri v5.4s, v1.4s, #8 + sri v6.4s, v2.4s, #8 + sri v7.4s, v3.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + rev32 v2.8h, v2.8h + rev32 v3.8h, v3.8h + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + # Round Done + subs w8, w8, #2 + bne L_AES_CTR_encrypt_NEON_loop_nr_4 + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, 
v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x7] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + movi v0.16b, #27 + and v8.16b, v8.16b, v0.16b + and v9.16b, v9.16b, v0.16b + and v10.16b, v10.16b, v0.16b + and v11.16b, v11.16b, v0.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + eor v2.16b, v10.16b, v6.16b + eor v3.16b, v11.16b, v7.16b + shl v12.4s, v0.4s, #8 + shl v13.4s, v1.4s, #8 + shl v14.4s, v2.4s, #8 + shl v15.4s, v3.4s, #8 + sri v12.4s, v0.4s, #24 + sri v13.4s, v1.4s, #24 + sri v14.4s, v2.4s, #24 + sri v15.4s, v3.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + shl v2.4s, v6.4s, #24 + shl v3.4s, v7.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + sri v2.4s, v6.4s, #8 + sri v3.4s, v7.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + rev32 v6.8h, v6.8h + rev32 v7.8h, v7.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x9], #16 + eor v4.16b, v4.16b, 
v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b 
+ orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x7] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #0x40 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #0x40 + sub x2, x2, #0x40 + cmp x2, #0x40 + bge L_AES_CTR_encrypt_NEON_loop_4 + mov v2.d[1], x10 + mov v2.d[0], x11 + rev64 v2.16b, v2.16b + rev32 v2.16b, v2.16b +L_AES_CTR_encrypt_NEON_start_2: + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + movi v15.16b, #27 + cmp x2, #16 + beq L_AES_CTR_encrypt_NEON_start_1 + blt L_AES_CTR_encrypt_NEON_data_done +L_AES_CTR_encrypt_NEON_loop_2: + mov x9, x3 + ld1 {v4.2d}, [x9], #16 + # Round: 0 - XOR in key schedule + eor v0.16b, v2.16b, v4.16b + adds x10, x10, #1 + adc x11, x11, xzr + mov v2.d[1], x10 + mov v2.d[0], x11 + rev64 v2.16b, v2.16b + rev32 v2.16b, v2.16b + eor v1.16b, v2.16b, v4.16b + adds x10, x10, #1 + adc x11, x11, xzr + mov v2.d[1], x10 + mov v2.d[0], x11 + rev64 v2.16b, v2.16b + rev32 v2.16b, v2.16b + sub w8, w4, #2 +L_AES_CTR_encrypt_NEON_loop_nr_2: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, 
v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x7] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v10.16b, v4.16b, #1 + shl v11.16b, v5.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, v9.16b, v11.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + shl v10.4s, v0.4s, #8 + shl v11.4s, v1.4s, #8 + sri v10.4s, v0.4s, #24 + sri v11.4s, v1.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # Round Done + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, 
v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x7] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + shl v10.16b, v0.16b, #1 + shl v11.16b, v1.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, v9.16b, v11.16b + eor v4.16b, v8.16b, v0.16b + eor v5.16b, v9.16b, v1.16b + shl v10.4s, v4.4s, #8 + shl v11.4s, v5.4s, #8 + sri v10.4s, v4.4s, #24 + sri v11.4s, v5.4s, #24 + shl v4.4s, v0.4s, #24 + shl v5.4s, v1.4s, #24 + sri v4.4s, v0.4s, #8 + sri v5.4s, v1.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v0.16b, v0.16b, v10.16b + eor v1.16b, v1.16b, v11.16b + # Round Done + subs w8, w8, #2 + bne L_AES_CTR_encrypt_NEON_loop_nr_2 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x7] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + 
sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v10.16b, v4.16b, #1 + shl v11.16b, v5.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, v9.16b, v11.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + shl v10.4s, v0.4s, #8 + shl v11.4s, v1.4s, #8 + sri v10.4s, v0.4s, #24 + sri v11.4s, v1.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # Round Done + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x7] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + ld1 {v4.16b, v5.16b}, [x0], #32 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + st1 
{v0.16b, v1.16b}, [x1], #32 + sub x2, x2, #32 + cmp x2, #0 + beq L_AES_CTR_encrypt_NEON_data_done +L_AES_CTR_encrypt_NEON_start_1: + ld1 {v3.2d}, [x7] + mov x9, x3 + ld1 {v4.2d}, [x9], #16 + # Round: 0 - XOR in key schedule + eor v0.16b, v2.16b, v4.16b + sub w8, w4, #2 +L_AES_CTR_encrypt_NEON_loop_nr_1: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v0.2d}, [x9], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v0.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x9], #16 + sshr v10.16b, v0.16b, #7 + shl v9.16b, v0.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v0.8h + eor v11.16b, v10.16b, v0.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v0.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v4.16b + sri v9.4s, v0.4s, #8 + sri v8.4s, 
v11.4s, #24 + eor v0.16b, v10.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + subs w8, w8, #2 + bne L_AES_CTR_encrypt_NEON_loop_nr_1 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v0.2d}, [x9], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v0.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x9], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + rev32 v0.16b, v0.16b + ld1 {v4.16b}, [x0], #16 + eor v0.16b, v0.16b, v4.16b + st1 {v0.16b}, [x1], #16 + adds x10, x10, #1 + adc x11, x11, xzr + mov v2.d[1], x10 + mov v2.d[0], x11 + rev64 v2.16b, v2.16b + rev32 v2.16b, v2.16b +L_AES_CTR_encrypt_NEON_data_done: + rev32 v2.16b, v2.16b + st1 {v2.2d}, [x5] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ 
+ .size AES_CTR_encrypt_NEON,.-AES_CTR_encrypt_NEON +#endif /* __APPLE__ */ +#endif /* WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_CBC) || defined(HAVE_AES_ECB) +#ifndef __APPLE__ + .text + .type L_AES_ARM64_NEON_td, %object + .section .rodata /* ELF builds place the table in .rodata; the Mach-O branch below uses __DATA,__data */ + .size L_AES_ARM64_NEON_td, 256 /* 32 rows of 8 bytes follow */ +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 1 /* NOTE(review): GNU as for AArch64 takes .align as a power of two, so both branches request 2-byte alignment; the table is only read with ld1 in the code visible here */ +#else + .p2align 1 +#endif /* __APPLE__ */ +L_AES_ARM64_NEON_td: /* AES inverse S-box (InvSubBytes, FIPS 197), one byte per input value 0x00-0xff. AES_ECB_decrypt_NEON loads it 64 bytes at a time into v16-v19, v20-v23, v24-v27 and v28-v31, looks each quarter up with a 4-register tbl, XORs the index with 0x40, 0x80 or 0xc0 to select the quarter, and ORs the four results together. */ + .byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38 + .byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb + .byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87 + .byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb + .byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d + .byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e + .byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2 + .byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25 + .byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16 + .byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92 + .byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda + .byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84 + .byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a + .byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06 + .byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02 + .byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b + .byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea + .byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73 + .byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85 + .byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e + .byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89 + .byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b + .byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20 + .byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4 + .byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31 + .byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f + .byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d + .byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef + .byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0 + .byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61 + .byte
0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26 + .byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d +#ifndef __APPLE__ + .text + .type L_AES_ARM64_NEON_shift_rows_invshuffle, %object + .section .rodata + .size L_AES_ARM64_NEON_shift_rows_invshuffle, 16 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 1 +#else + .p2align 1 +#endif /* __APPLE__ */ +L_AES_ARM64_NEON_shift_rows_invshuffle: /* 16-byte tbl index vector: AES_ECB_decrypt_NEON loads it through x6 and applies it to each block right after the inverse S-box lookup. The indices are the InvShiftRows permutation written for a state whose 32-bit words were byte-reversed with rev32 on input (output byte p is taken from input byte table[p]). */ + .byte 0x04,0x09,0x0e,0x03,0x08,0x0d,0x02,0x07 + .byte 0x0c,0x01,0x06,0x0b,0x00,0x05,0x0a,0x0f +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_ECB) +#ifndef __APPLE__ +.text +.globl AES_ECB_decrypt_NEON +.type AES_ECB_decrypt_NEON,@function +.align 2 +AES_ECB_decrypt_NEON: +#else +.section __TEXT,__text +.globl _AES_ECB_decrypt_NEON +.p2align 2 +_AES_ECB_decrypt_NEON: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x5, L_AES_ARM64_NEON_td + add x5, x5, :lo12:L_AES_ARM64_NEON_td +#else + adrp x5, L_AES_ARM64_NEON_td@PAGE + add x5, x5, :lo12:L_AES_ARM64_NEON_td@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x6, L_AES_ARM64_NEON_shift_rows_invshuffle + add x6, x6, :lo12:L_AES_ARM64_NEON_shift_rows_invshuffle +#else + adrp x6, L_AES_ARM64_NEON_shift_rows_invshuffle@PAGE + add x6, x6, :lo12:L_AES_ARM64_NEON_shift_rows_invshuffle@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], #0x40 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x5], #0x40 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x5], #0x40 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x5] + cmp x2, #0x40 + blt L_AES_ECB_decrypt_NEON_start_2 +L_AES_ECB_decrypt_NEON_loop_4: + mov x8, x3 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld1 {v4.2d}, [x8], #16 + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b + # Round: 0 - XOR in key
schedule + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + sub w7, w4, #2 +L_AES_ECB_decrypt_NEON_loop_nr_4: /* each pass of this loop performs two rounds on four blocks (state alternates between v0-v3 and v4-v7); w7 counts the rounds left before the two that follow the loop */ + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 /* a four-register tbl returns 0 for any index >= 64, so the 256-entry lookup is done in four parts: index, index^0x40, index^0x80 and index^0xc0 against v16-v19, v20-v23, v24-v27 and v28-v31, with the four results OR-ed together */ + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x6] /* byte-permutation indices from L_AES_ARM64_NEON_shift_rows_invshuffle */ + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b},
v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + movi v28.16b, #27 /* 27 = 0x1b, the reduction constant for GF(2^8) doubling; this overwrites table register v28, and v28-v31 are reloaded from [x5] at the end of the round */ + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + and v8.16b, v8.16b, v28.16b + and v9.16b, v9.16b, v28.16b + and v10.16b, v10.16b, v28.16b + and v11.16b, v11.16b, v28.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b /* v8-v11 = 2*s per byte in GF(2^8), s being the permuted state in v4-v7 */ + ushr v12.16b, v4.16b, #6 + ushr v13.16b, v5.16b, #6 + ushr v14.16b, v6.16b, #6 + ushr v15.16b, v7.16b, #6 + shl v0.16b, v4.16b, #2 + shl v1.16b, v5.16b, #2 + shl v2.16b, v6.16b, #2 + shl v3.16b, v7.16b, #2 + pmul v12.16b, v12.16b, v28.16b + pmul v13.16b, v13.16b, v28.16b + pmul v14.16b, v14.16b, v28.16b + pmul v15.16b, v15.16b, v28.16b + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b /* v12-v15 = 4*s: the two bits shifted out are folded back in by the polynomial multiply with 0x1b */ + ushr v0.16b, v4.16b, #5 + ushr v1.16b, v5.16b, #5 + ushr v2.16b, v6.16b, #5 + ushr v3.16b, v7.16b, #5 + pmul v0.16b, v0.16b, v28.16b + pmul v1.16b, v1.16b, v28.16b + pmul v2.16b, v2.16b, v28.16b + pmul v3.16b, v3.16b, v28.16b + shl v28.16b, v4.16b, #3 + shl v29.16b, v5.16b, #3 + shl v30.16b, v6.16b, #3 + shl v31.16b, v7.16b, #3 + eor v0.16b, v0.16b, v28.16b + eor v1.16b, v1.16b, v29.16b + eor v2.16b, v2.16b, v30.16b + eor v3.16b, v3.16b, v31.16b /* v0-v3 = 8*s */ + eor v28.16b, v8.16b, v0.16b + eor v29.16b, v9.16b, v1.16b + eor v30.16b, v10.16b, v2.16b + eor v31.16b, v11.16b, v3.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + eor v8.16b, v12.16b, v0.16b + eor v9.16b, v13.16b, v1.16b + eor v10.16b, v14.16b, v2.16b + eor v11.16b, v15.16b, v3.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + eor v28.16b, v28.16b, v4.16b + eor v29.16b,
v29.16b, v5.16b + eor v30.16b, v30.16b, v6.16b + eor v31.16b, v31.16b, v7.16b /* now v0-v3 = 9*s, v28-v31 = 11*s, v8-v11 = 13*s, v12-v15 = 14*s (the InvMixColumns coefficients); they are combined below using per-word byte rotations */ + shl v4.4s, v28.4s, #8 + shl v5.4s, v29.4s, #8 + shl v6.4s, v30.4s, #8 + shl v7.4s, v31.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + rev32 v10.8h, v10.8h + rev32 v11.8h, v11.8h + sri v4.4s, v28.4s, #24 + sri v5.4s, v29.4s, #24 + sri v6.4s, v30.4s, #24 + sri v7.4s, v31.4s, #24 + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + shl v28.4s, v0.4s, #24 + shl v29.4s, v1.4s, #24 + shl v30.4s, v2.4s, #24 + shl v31.4s, v3.4s, #24 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + sri v28.4s, v0.4s, #8 + sri v29.4s, v1.4s, #8 + sri v30.4s, v2.4s, #8 + sri v31.4s, v3.4s, #8 + eor v4.16b, v4.16b, v28.16b + eor v5.16b, v5.16b, v29.16b + eor v6.16b, v6.16b, v30.16b + eor v7.16b, v7.16b, v31.16b + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x5] /* restore the last 64 table bytes that were used as scratch above */ + # XOR in Key Schedule + ld1 {v0.2d}, [x8], #16 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor
v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x6] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b + movi v28.16b, #27 + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + sshr v10.16b, v2.16b, #7 + sshr v11.16b, v3.16b, #7 + shl v12.16b, v0.16b, #1 + shl v13.16b, v1.16b, #1 + shl v14.16b, v2.16b, #1 + shl v15.16b, v3.16b, #1 + and v8.16b, v8.16b, v28.16b + and v9.16b, v9.16b, v28.16b + and v10.16b, v10.16b, v28.16b + and v11.16b, v11.16b, v28.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + ushr v12.16b, v0.16b, #6 + ushr v13.16b, v1.16b, #6 + ushr v14.16b, v2.16b, #6 + ushr v15.16b, v3.16b, #6 + shl v4.16b, v0.16b, #2 + shl v5.16b, v1.16b, #2 + shl v6.16b, v2.16b, #2 + shl v7.16b, v3.16b, #2 + pmul v12.16b, v12.16b, v28.16b + pmul v13.16b, v13.16b, v28.16b + pmul v14.16b, v14.16b, v28.16b + pmul v15.16b, v15.16b, v28.16b + eor v12.16b, v12.16b, v4.16b + eor v13.16b, v13.16b, v5.16b + eor v14.16b, v14.16b, v6.16b + eor v15.16b, v15.16b, v7.16b + ushr v4.16b,
v0.16b, #5 + ushr v5.16b, v1.16b, #5 + ushr v6.16b, v2.16b, #5 + ushr v7.16b, v3.16b, #5 + pmul v4.16b, v4.16b, v28.16b + pmul v5.16b, v5.16b, v28.16b + pmul v6.16b, v6.16b, v28.16b + pmul v7.16b, v7.16b, v28.16b + shl v28.16b, v0.16b, #3 + shl v29.16b, v1.16b, #3 + shl v30.16b, v2.16b, #3 + shl v31.16b, v3.16b, #3 + eor v4.16b, v4.16b, v28.16b + eor v5.16b, v5.16b, v29.16b + eor v6.16b, v6.16b, v30.16b + eor v7.16b, v7.16b, v31.16b + eor v28.16b, v8.16b, v4.16b + eor v29.16b, v9.16b, v5.16b + eor v30.16b, v10.16b, v6.16b + eor v31.16b, v11.16b, v7.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + eor v8.16b, v12.16b, v4.16b + eor v9.16b, v13.16b, v5.16b + eor v10.16b, v14.16b, v6.16b + eor v11.16b, v15.16b, v7.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + shl v0.4s, v28.4s, #8 + shl v1.4s, v29.4s, #8 + shl v2.4s, v30.4s, #8 + shl v3.4s, v31.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + rev32 v10.8h, v10.8h + rev32 v11.8h, v11.8h + sri v0.4s, v28.4s, #24 + sri v1.4s, v29.4s, #24 + sri v2.4s, v30.4s, #24 + sri v3.4s, v31.4s, #24 + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + shl v28.4s, v4.4s, #24 + shl v29.4s, v5.4s, #24 + shl v30.4s, v6.4s, #24 + shl v31.4s, v7.4s, #24 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + sri v28.4s, v4.4s, #8 + sri v29.4s, v5.4s, #8 + sri v30.4s, v6.4s, #8 + sri v31.4s, v7.4s, #8 + eor v0.16b, v0.16b, v28.16b + eor v1.16b, v1.16b, v29.16b + eor v2.16b, v2.16b, v30.16b + eor v3.16b, v3.16b, v31.16b + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x5] + # XOR in Key Schedule + ld1 {v4.2d}, [x8], #16 + eor v0.16b,
v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + # Round Done + subs w7, w7, #2 + bne L_AES_ECB_decrypt_NEON_loop_nr_4 /* two rounds remain after the loop: one full round, then a last round that does the lookups, the permutation and the key XOR but no column mixing */ + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x6] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b
+ tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + movi v28.16b, #27 + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + and v8.16b, v8.16b, v28.16b + and v9.16b, v9.16b, v28.16b + and v10.16b, v10.16b, v28.16b + and v11.16b, v11.16b, v28.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + ushr v12.16b, v4.16b, #6 + ushr v13.16b, v5.16b, #6 + ushr v14.16b, v6.16b, #6 + ushr v15.16b, v7.16b, #6 + shl v0.16b, v4.16b, #2 + shl v1.16b, v5.16b, #2 + shl v2.16b, v6.16b, #2 + shl v3.16b, v7.16b, #2 + pmul v12.16b, v12.16b, v28.16b + pmul v13.16b, v13.16b, v28.16b + pmul v14.16b, v14.16b, v28.16b + pmul v15.16b, v15.16b, v28.16b + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + ushr v0.16b, v4.16b, #5 + ushr v1.16b, v5.16b, #5 + ushr v2.16b, v6.16b, #5 + ushr v3.16b, v7.16b, #5 + pmul v0.16b, v0.16b, v28.16b + pmul v1.16b, v1.16b, v28.16b + pmul v2.16b, v2.16b, v28.16b + pmul v3.16b, v3.16b, v28.16b + shl v28.16b, v4.16b, #3 + shl v29.16b, v5.16b, #3 + shl v30.16b, v6.16b, #3 + shl v31.16b, v7.16b, #3 + eor v0.16b, v0.16b, v28.16b + eor v1.16b, v1.16b, v29.16b + eor v2.16b, v2.16b, v30.16b + eor v3.16b, v3.16b, v31.16b + eor v28.16b, v8.16b, v0.16b + eor v29.16b, v9.16b, v1.16b + eor v30.16b, v10.16b, v2.16b + eor v31.16b, v11.16b, v3.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + eor v8.16b, v12.16b, v0.16b + eor v9.16b, v13.16b, v1.16b + eor v10.16b, v14.16b, v2.16b + eor v11.16b, v15.16b, v3.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + eor v28.16b, v28.16b, v4.16b + eor v29.16b, v29.16b,
v5.16b + eor v30.16b, v30.16b, v6.16b + eor v31.16b, v31.16b, v7.16b + shl v4.4s, v28.4s, #8 + shl v5.4s, v29.4s, #8 + shl v6.4s, v30.4s, #8 + shl v7.4s, v31.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + rev32 v10.8h, v10.8h + rev32 v11.8h, v11.8h + sri v4.4s, v28.4s, #24 + sri v5.4s, v29.4s, #24 + sri v6.4s, v30.4s, #24 + sri v7.4s, v31.4s, #24 + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + shl v28.4s, v0.4s, #24 + shl v29.4s, v1.4s, #24 + shl v30.4s, v2.4s, #24 + shl v31.4s, v3.4s, #24 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + sri v28.4s, v0.4s, #8 + sri v29.4s, v1.4s, #8 + sri v30.4s, v2.4s, #8 + sri v31.4s, v3.4s, #8 + eor v4.16b, v4.16b, v28.16b + eor v5.16b, v5.16b, v29.16b + eor v6.16b, v6.16b, v30.16b + eor v7.16b, v7.16b, v31.16b + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x5] + # XOR in Key Schedule + ld1 {v0.2d}, [x8], #16 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b,
v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x6] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x8], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #0x40 + sub x2, x2, #0x40 + cmp x2, #0x40 + bge L_AES_ECB_decrypt_NEON_loop_4 +L_AES_ECB_decrypt_NEON_start_2: /* fewer than 64 bytes remain: exactly 16 goes to the single-block path, fewer than 16 finishes, anything else falls through to process two blocks */ + cmp x2, #16 + beq L_AES_ECB_decrypt_NEON_start_1 + blt L_AES_ECB_decrypt_NEON_data_done +L_AES_ECB_decrypt_NEON_loop_2: /* despite the label name nothing branches back here, so this two-block path runs at most once per call */ + mov x8, x3 + ld1 {v0.16b, v1.16b}, [x0], #32 + ld1 {v4.2d}, [x8], #16 + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + # Round: 0 - XOR in key schedule + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + sub w7, w4, #2 +L_AES_ECB_decrypt_NEON_loop_nr_2: + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b,
v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x6] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + movi v10.16b, #27 /* the two-block path keeps the 0x1b constant and its scratch values in v8-v13, so table registers v28-v31 are never overwritten and need no reload */ + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + and v8.16b, v8.16b, v10.16b + and v9.16b, v9.16b, v10.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + ushr v12.16b, v4.16b, #6 + ushr v13.16b, v5.16b, #6 + shl v0.16b, v4.16b, #2 + shl v1.16b, v5.16b, #2 + pmul v12.16b, v12.16b, v10.16b + pmul v13.16b, v13.16b, v10.16b + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + ushr v0.16b, v4.16b, #5 + ushr v1.16b, v5.16b, #5 + pmul v0.16b, v0.16b, v10.16b + pmul v1.16b, v1.16b, v10.16b + shl v10.16b, v4.16b, #3 + shl v11.16b, v5.16b, #3 + eor v0.16b, v0.16b, v10.16b + eor v1.16b, v1.16b, v11.16b + eor v10.16b, v8.16b, v0.16b + eor v11.16b, v9.16b, v1.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v8.16b, v12.16b, v0.16b + eor v9.16b, v13.16b, v1.16b + eor v12.16b, v12.16b, v10.16b + eor v13.16b, v13.16b, v11.16b + eor v10.16b, v10.16b, v4.16b + eor v11.16b, v11.16b, v5.16b + shl v4.4s, v10.4s, #8 + shl v5.4s, v11.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + sri v4.4s, v10.4s, #24 + sri v5.4s, v11.4s, #24 + eor v4.16b,
v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + shl v10.4s, v0.4s, #24 + shl v11.4s, v1.4s, #24 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + sri v10.4s, v0.4s, #8 + sri v11.4s, v1.4s, #8 + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x8], #16 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + # Round Done + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x6] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + movi v10.16b, #27 + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + shl v12.16b, v0.16b, #1 + shl v13.16b, v1.16b, #1 + and v8.16b, v8.16b, v10.16b + and v9.16b, v9.16b, v10.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + ushr v12.16b, v0.16b, #6 + ushr v13.16b, v1.16b, #6 + shl v4.16b, v0.16b, #2 + shl v5.16b, v1.16b, #2 + pmul v12.16b, v12.16b, v10.16b + pmul v13.16b, v13.16b, v10.16b + eor v12.16b, v12.16b, v4.16b + eor v13.16b, v13.16b, v5.16b + ushr v4.16b, v0.16b, #5 + ushr v5.16b, v1.16b, #5 + pmul v4.16b, v4.16b, v10.16b + pmul v5.16b, v5.16b, v10.16b + shl v10.16b, v0.16b, #3 + shl v11.16b,
v1.16b, #3 + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v10.16b, v8.16b, v4.16b + eor v11.16b, v9.16b, v5.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v8.16b, v12.16b, v4.16b + eor v9.16b, v13.16b, v5.16b + eor v12.16b, v12.16b, v10.16b + eor v13.16b, v13.16b, v11.16b + eor v10.16b, v10.16b, v0.16b + eor v11.16b, v11.16b, v1.16b + shl v0.4s, v10.4s, #8 + shl v1.4s, v11.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + sri v0.4s, v10.4s, #24 + sri v1.4s, v11.4s, #24 + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + shl v10.4s, v4.4s, #24 + shl v11.4s, v5.4s, #24 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + sri v10.4s, v4.4s, #8 + sri v11.4s, v5.4s, #8 + eor v0.16b, v0.16b, v10.16b + eor v1.16b, v1.16b, v11.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x8], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + # Round Done + subs w7, w7, #2 + bne L_AES_ECB_decrypt_NEON_loop_nr_2 /* as in the four-block path, two more rounds follow the loop and the last one has no column mixing */ + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x6] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + movi v10.16b, #27 + sshr v8.16b, v4.16b, #7
+ sshr v9.16b, v5.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + and v8.16b, v8.16b, v10.16b + and v9.16b, v9.16b, v10.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + ushr v12.16b, v4.16b, #6 + ushr v13.16b, v5.16b, #6 + shl v0.16b, v4.16b, #2 + shl v1.16b, v5.16b, #2 + pmul v12.16b, v12.16b, v10.16b + pmul v13.16b, v13.16b, v10.16b + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + ushr v0.16b, v4.16b, #5 + ushr v1.16b, v5.16b, #5 + pmul v0.16b, v0.16b, v10.16b + pmul v1.16b, v1.16b, v10.16b + shl v10.16b, v4.16b, #3 + shl v11.16b, v5.16b, #3 + eor v0.16b, v0.16b, v10.16b + eor v1.16b, v1.16b, v11.16b + eor v10.16b, v8.16b, v0.16b + eor v11.16b, v9.16b, v1.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v8.16b, v12.16b, v0.16b + eor v9.16b, v13.16b, v1.16b + eor v12.16b, v12.16b, v10.16b + eor v13.16b, v13.16b, v11.16b + eor v10.16b, v10.16b, v4.16b + eor v11.16b, v11.16b, v5.16b + shl v4.4s, v10.4s, #8 + shl v5.4s, v11.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + sri v4.4s, v10.4s, #24 + sri v5.4s, v11.4s, #24 + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + shl v10.4s, v0.4s, #24 + shl v11.4s, v1.4s, #24 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + sri v10.4s, v0.4s, #8 + sri v11.4s, v1.4s, #8 + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x8], #16 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + # Round Done + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b,
{v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x6] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x8], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + st1 {v0.16b, v1.16b}, [x1], #32 + sub x2, x2, #32 + cmp x2, #0 + beq L_AES_ECB_decrypt_NEON_data_done /* NOTE(review): any non-zero remainder falls through and is handled as one more 16-byte block - assumes the byte count is a multiple of 16 */ +L_AES_ECB_decrypt_NEON_start_1: /* single-block path: the three index masks, the 0x1b constant and the permutation vector are set up once in v12-v15 and v3 instead of inside every round */ + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + movi v15.16b, #27 + ld1 {v3.2d}, [x6] + mov x8, x3 + ld1 {v0.16b}, [x0], #16 + ld1 {v4.2d}, [x8], #16 + rev32 v0.16b, v0.16b + # Round: 0 - XOR in key schedule + eor v0.16b, v0.16b, v4.16b + sub w7, w4, #2 +L_AES_ECB_decrypt_NEON_loop_nr_1: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + sshr v10.16b, v4.16b, #7 + ushr v11.16b, v4.16b, #6 + ushr v8.16b, v4.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v4.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v4.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v4.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v4.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b,
v11.16b, v9.16b + eor v9.16b, v9.16b, v4.16b + shl v4.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v4.4s, v9.4s, #24 + eor v4.16b, v4.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v4.16b, v4.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v4.16b, v4.16b, v9.16b + ld1 {v0.2d}, [x8], #16 + # XOR in Key Schedule + eor v4.16b, v4.16b, v0.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + sshr v10.16b, v0.16b, #7 + ushr v11.16b, v0.16b, #6 + ushr v8.16b, v0.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v0.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v0.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v0.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v0.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v0.16b + shl v0.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v0.4s, v9.4s, #24 + eor v0.16b, v0.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v0.16b, v0.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v0.16b, v0.16b, v9.16b + ld1 {v4.2d}, [x8], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + subs w7, w7, #2 + bne L_AES_ECB_decrypt_NEON_loop_nr_1 /* two more rounds follow; the last one has no column mixing */ + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b +
orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + sshr v10.16b, v4.16b, #7 + ushr v11.16b, v4.16b, #6 + ushr v8.16b, v4.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v4.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v4.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v4.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v4.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v4.16b + shl v4.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v4.4s, v9.4s, #24 + eor v4.16b, v4.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v4.16b, v4.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v4.16b, v4.16b, v9.16b + ld1 {v0.2d}, [x8], #16 + # XOR in Key Schedule + eor v4.16b, v4.16b, v0.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x8], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + rev32 v0.16b, v0.16b + st1 {v0.16b}, [x1], #16 +L_AES_ECB_decrypt_NEON_data_done: /* common exit: restore d8-d15 and x29/x30 saved by the prologue and release the 80-byte (0x50) frame */ + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size AES_ECB_decrypt_NEON,.-AES_ECB_decrypt_NEON +#endif /* __APPLE__ */ +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || defined(HAVE_AES_ECB) */ +#ifdef HAVE_AES_CBC +#ifndef __APPLE__ +.text +.globl AES_CBC_decrypt_NEON +.type AES_CBC_decrypt_NEON,@function +.align 2 +AES_CBC_decrypt_NEON: +#else +.section __TEXT,__text +.globl _AES_CBC_decrypt_NEON +.p2align 2
+_AES_CBC_decrypt_NEON: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-160]! + add x29, sp, #0 + stp d8, d9, [x29, #96] + stp d10, d11, [x29, #112] + stp d12, d13, [x29, #128] + stp d14, d15, [x29, #144] +#ifndef __APPLE__ + adrp x6, L_AES_ARM64_NEON_td + add x6, x6, :lo12:L_AES_ARM64_NEON_td +#else + adrp x6, L_AES_ARM64_NEON_td@PAGE + add x6, x6, :lo12:L_AES_ARM64_NEON_td@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x7, L_AES_ARM64_NEON_shift_rows_invshuffle + add x7, x7, :lo12:L_AES_ARM64_NEON_shift_rows_invshuffle +#else + adrp x7, L_AES_ARM64_NEON_shift_rows_invshuffle@PAGE + add x7, x7, :lo12:L_AES_ARM64_NEON_shift_rows_invshuffle@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x6], #0x40 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x6], #0x40 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x6], #0x40 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x6] + ld1 {v3.2d}, [x5] + add x10, x29, #16 + cmp x2, #0x40 + blt L_AES_CBC_decrypt_NEON_start_2 +L_AES_CBC_decrypt_NEON_loop_4: + mov x9, x3 + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #0x40 + st1 {v3.2d, v4.2d, v5.2d, v6.2d}, [x10] + str q7, [x10, #64] + ld1 {v8.2d}, [x9], #16 + rev32 v4.16b, v4.16b + rev32 v5.16b, v5.16b + rev32 v6.16b, v6.16b + rev32 v7.16b, v7.16b + # Round: 0 - XOR in key schedule + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + sub w8, w4, #2 +L_AES_CBC_decrypt_NEON_loop_nr_4: + tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v9.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v10.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v11.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v0.16b, v4.16b, v12.16b + eor v1.16b, v5.16b, v12.16b + eor v2.16b, v6.16b, v12.16b + eor v3.16b, v7.16b, v12.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v20.16b, v21.16b, 
v22.16b, v23.16b}, v1.16b + tbl v2.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v2.16b + tbl v3.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v3.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + eor v0.16b, v4.16b, v13.16b + eor v1.16b, v5.16b, v13.16b + eor v2.16b, v6.16b, v13.16b + eor v3.16b, v7.16b, v13.16b + tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + eor v0.16b, v4.16b, v14.16b + eor v1.16b, v5.16b, v14.16b + eor v2.16b, v6.16b, v14.16b + eor v3.16b, v7.16b, v14.16b + tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b + tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b + tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b + tbl v3.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v3.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld1 {v4.16b}, [x7] + tbl v8.16b, {v8.16b}, v4.16b + tbl v9.16b, {v9.16b}, v4.16b + tbl v10.16b, {v10.16b}, v4.16b + tbl v11.16b, {v11.16b}, v4.16b + movi v28.16b, #27 + sshr v0.16b, v8.16b, #7 + sshr v1.16b, v9.16b, #7 + sshr v2.16b, v10.16b, #7 + sshr v3.16b, v11.16b, #7 + shl v12.16b, v8.16b, #1 + shl v13.16b, v9.16b, #1 + shl v14.16b, v10.16b, #1 + shl v15.16b, v11.16b, #1 + and v0.16b, v0.16b, v28.16b + and v1.16b, v1.16b, v28.16b + and v2.16b, v2.16b, v28.16b + and v3.16b, v3.16b, v28.16b + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + ushr v12.16b, v8.16b, #6 + ushr v13.16b, v9.16b, #6 + ushr v14.16b, v10.16b, #6 + ushr v15.16b, v11.16b, #6 + shl v4.16b, v8.16b, #2 + shl v5.16b, v9.16b, #2 + 
shl v6.16b, v10.16b, #2 + shl v7.16b, v11.16b, #2 + pmul v12.16b, v12.16b, v28.16b + pmul v13.16b, v13.16b, v28.16b + pmul v14.16b, v14.16b, v28.16b + pmul v15.16b, v15.16b, v28.16b + eor v12.16b, v12.16b, v4.16b + eor v13.16b, v13.16b, v5.16b + eor v14.16b, v14.16b, v6.16b + eor v15.16b, v15.16b, v7.16b + ushr v4.16b, v8.16b, #5 + ushr v5.16b, v9.16b, #5 + ushr v6.16b, v10.16b, #5 + ushr v7.16b, v11.16b, #5 + pmul v4.16b, v4.16b, v28.16b + pmul v5.16b, v5.16b, v28.16b + pmul v6.16b, v6.16b, v28.16b + pmul v7.16b, v7.16b, v28.16b + shl v28.16b, v8.16b, #3 + shl v29.16b, v9.16b, #3 + shl v30.16b, v10.16b, #3 + shl v31.16b, v11.16b, #3 + eor v4.16b, v4.16b, v28.16b + eor v5.16b, v5.16b, v29.16b + eor v6.16b, v6.16b, v30.16b + eor v7.16b, v7.16b, v31.16b + eor v28.16b, v0.16b, v4.16b + eor v29.16b, v1.16b, v5.16b + eor v30.16b, v2.16b, v6.16b + eor v31.16b, v3.16b, v7.16b + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + eor v0.16b, v12.16b, v4.16b + eor v1.16b, v13.16b, v5.16b + eor v2.16b, v14.16b, v6.16b + eor v3.16b, v15.16b, v7.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + eor v28.16b, v28.16b, v8.16b + eor v29.16b, v29.16b, v9.16b + eor v30.16b, v30.16b, v10.16b + eor v31.16b, v31.16b, v11.16b + shl v8.4s, v28.4s, #8 + shl v9.4s, v29.4s, #8 + shl v10.4s, v30.4s, #8 + shl v11.4s, v31.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + rev32 v2.8h, v2.8h + rev32 v3.8h, v3.8h + sri v8.4s, v28.4s, #24 + sri v9.4s, v29.4s, #24 + sri v10.4s, v30.4s, #24 + sri v11.4s, v31.4s, #24 + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + shl v28.4s, v4.4s, #24 + shl v29.4s, v5.4s, #24 + shl v30.4s, v6.4s, #24 + shl v31.4s, v7.4s, #24 + eor v8.16b, v8.16b, v0.16b + eor v9.16b, v9.16b, v1.16b + eor v10.16b, v10.16b, v2.16b + eor v11.16b, v11.16b, 
v3.16b + sri v28.4s, v4.4s, #8 + sri v29.4s, v5.4s, #8 + sri v30.4s, v6.4s, #8 + sri v31.4s, v7.4s, #8 + eor v8.16b, v8.16b, v28.16b + eor v9.16b, v9.16b, v29.16b + eor v10.16b, v10.16b, v30.16b + eor v11.16b, v11.16b, v31.16b + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x6] + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v8.16b, v8.16b, v4.16b + eor v9.16b, v9.16b, v4.16b + eor v10.16b, v10.16b, v4.16b + eor v11.16b, v11.16b, v4.16b + # Round Done + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v9.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v10.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v11.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v0.16b, v8.16b, v12.16b + eor v1.16b, v9.16b, v12.16b + eor v2.16b, v10.16b, v12.16b + eor v3.16b, v11.16b, v12.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b + tbl v2.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v2.16b + tbl v3.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v3.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + orr v6.16b, v6.16b, v2.16b + orr v7.16b, v7.16b, v3.16b + eor v0.16b, v8.16b, v13.16b + eor v1.16b, v9.16b, v13.16b + eor v2.16b, v10.16b, v13.16b + eor v3.16b, v11.16b, v13.16b + tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + orr v6.16b, v6.16b, v2.16b + orr v7.16b, v7.16b, v3.16b + eor v0.16b, v8.16b, v14.16b + eor v1.16b, v9.16b, v14.16b + eor v2.16b, v10.16b, v14.16b + eor v3.16b, v11.16b, v14.16b + tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b + tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b + tbl v2.16b, {v28.16b, v29.16b, v30.16b, 
v31.16b}, v2.16b + tbl v3.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v3.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + orr v6.16b, v6.16b, v2.16b + orr v7.16b, v7.16b, v3.16b + ld1 {v8.16b}, [x7] + tbl v4.16b, {v4.16b}, v8.16b + tbl v5.16b, {v5.16b}, v8.16b + tbl v6.16b, {v6.16b}, v8.16b + tbl v7.16b, {v7.16b}, v8.16b + movi v28.16b, #27 + sshr v0.16b, v4.16b, #7 + sshr v1.16b, v5.16b, #7 + sshr v2.16b, v6.16b, #7 + sshr v3.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + and v0.16b, v0.16b, v28.16b + and v1.16b, v1.16b, v28.16b + and v2.16b, v2.16b, v28.16b + and v3.16b, v3.16b, v28.16b + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + ushr v12.16b, v4.16b, #6 + ushr v13.16b, v5.16b, #6 + ushr v14.16b, v6.16b, #6 + ushr v15.16b, v7.16b, #6 + shl v8.16b, v4.16b, #2 + shl v9.16b, v5.16b, #2 + shl v10.16b, v6.16b, #2 + shl v11.16b, v7.16b, #2 + pmul v12.16b, v12.16b, v28.16b + pmul v13.16b, v13.16b, v28.16b + pmul v14.16b, v14.16b, v28.16b + pmul v15.16b, v15.16b, v28.16b + eor v12.16b, v12.16b, v8.16b + eor v13.16b, v13.16b, v9.16b + eor v14.16b, v14.16b, v10.16b + eor v15.16b, v15.16b, v11.16b + ushr v8.16b, v4.16b, #5 + ushr v9.16b, v5.16b, #5 + ushr v10.16b, v6.16b, #5 + ushr v11.16b, v7.16b, #5 + pmul v8.16b, v8.16b, v28.16b + pmul v9.16b, v9.16b, v28.16b + pmul v10.16b, v10.16b, v28.16b + pmul v11.16b, v11.16b, v28.16b + shl v28.16b, v4.16b, #3 + shl v29.16b, v5.16b, #3 + shl v30.16b, v6.16b, #3 + shl v31.16b, v7.16b, #3 + eor v8.16b, v8.16b, v28.16b + eor v9.16b, v9.16b, v29.16b + eor v10.16b, v10.16b, v30.16b + eor v11.16b, v11.16b, v31.16b + eor v28.16b, v0.16b, v8.16b + eor v29.16b, v1.16b, v9.16b + eor v30.16b, v2.16b, v10.16b + eor v31.16b, v3.16b, v11.16b + eor v8.16b, v8.16b, v4.16b + eor v9.16b, v9.16b, v5.16b + eor v10.16b, v10.16b, v6.16b + eor v11.16b, v11.16b, v7.16b + eor v0.16b, v12.16b, 
v8.16b + eor v1.16b, v13.16b, v9.16b + eor v2.16b, v14.16b, v10.16b + eor v3.16b, v15.16b, v11.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + eor v28.16b, v28.16b, v4.16b + eor v29.16b, v29.16b, v5.16b + eor v30.16b, v30.16b, v6.16b + eor v31.16b, v31.16b, v7.16b + shl v4.4s, v28.4s, #8 + shl v5.4s, v29.4s, #8 + shl v6.4s, v30.4s, #8 + shl v7.4s, v31.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + rev32 v2.8h, v2.8h + rev32 v3.8h, v3.8h + sri v4.4s, v28.4s, #24 + sri v5.4s, v29.4s, #24 + sri v6.4s, v30.4s, #24 + sri v7.4s, v31.4s, #24 + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + shl v28.4s, v8.4s, #24 + shl v29.4s, v9.4s, #24 + shl v30.4s, v10.4s, #24 + shl v31.4s, v11.4s, #24 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + sri v28.4s, v8.4s, #8 + sri v29.4s, v9.4s, #8 + sri v30.4s, v10.4s, #8 + sri v31.4s, v11.4s, #8 + eor v4.16b, v4.16b, v28.16b + eor v5.16b, v5.16b, v29.16b + eor v6.16b, v6.16b, v30.16b + eor v7.16b, v7.16b, v31.16b + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x6] + # XOR in Key Schedule + ld1 {v8.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + # Round Done + subs w8, w8, #2 + bne L_AES_CBC_decrypt_NEON_loop_nr_4 + tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v9.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v10.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v11.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v0.16b, v4.16b, v12.16b + eor v1.16b, v5.16b, v12.16b + eor v2.16b, v6.16b, v12.16b + eor v3.16b, v7.16b, v12.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v20.16b, v21.16b, v22.16b, 
v23.16b}, v1.16b + tbl v2.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v2.16b + tbl v3.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v3.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + eor v0.16b, v4.16b, v13.16b + eor v1.16b, v5.16b, v13.16b + eor v2.16b, v6.16b, v13.16b + eor v3.16b, v7.16b, v13.16b + tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + eor v0.16b, v4.16b, v14.16b + eor v1.16b, v5.16b, v14.16b + eor v2.16b, v6.16b, v14.16b + eor v3.16b, v7.16b, v14.16b + tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b + tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b + tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b + tbl v3.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v3.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + orr v10.16b, v10.16b, v2.16b + orr v11.16b, v11.16b, v3.16b + ld1 {v4.16b}, [x7] + tbl v8.16b, {v8.16b}, v4.16b + tbl v9.16b, {v9.16b}, v4.16b + tbl v10.16b, {v10.16b}, v4.16b + tbl v11.16b, {v11.16b}, v4.16b + movi v28.16b, #27 + sshr v0.16b, v8.16b, #7 + sshr v1.16b, v9.16b, #7 + sshr v2.16b, v10.16b, #7 + sshr v3.16b, v11.16b, #7 + shl v12.16b, v8.16b, #1 + shl v13.16b, v9.16b, #1 + shl v14.16b, v10.16b, #1 + shl v15.16b, v11.16b, #1 + and v0.16b, v0.16b, v28.16b + and v1.16b, v1.16b, v28.16b + and v2.16b, v2.16b, v28.16b + and v3.16b, v3.16b, v28.16b + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + ushr v12.16b, v8.16b, #6 + ushr v13.16b, v9.16b, #6 + ushr v14.16b, v10.16b, #6 + ushr v15.16b, v11.16b, #6 + shl v4.16b, v8.16b, #2 + shl v5.16b, v9.16b, #2 + shl 
v6.16b, v10.16b, #2 + shl v7.16b, v11.16b, #2 + pmul v12.16b, v12.16b, v28.16b + pmul v13.16b, v13.16b, v28.16b + pmul v14.16b, v14.16b, v28.16b + pmul v15.16b, v15.16b, v28.16b + eor v12.16b, v12.16b, v4.16b + eor v13.16b, v13.16b, v5.16b + eor v14.16b, v14.16b, v6.16b + eor v15.16b, v15.16b, v7.16b + ushr v4.16b, v8.16b, #5 + ushr v5.16b, v9.16b, #5 + ushr v6.16b, v10.16b, #5 + ushr v7.16b, v11.16b, #5 + pmul v4.16b, v4.16b, v28.16b + pmul v5.16b, v5.16b, v28.16b + pmul v6.16b, v6.16b, v28.16b + pmul v7.16b, v7.16b, v28.16b + shl v28.16b, v8.16b, #3 + shl v29.16b, v9.16b, #3 + shl v30.16b, v10.16b, #3 + shl v31.16b, v11.16b, #3 + eor v4.16b, v4.16b, v28.16b + eor v5.16b, v5.16b, v29.16b + eor v6.16b, v6.16b, v30.16b + eor v7.16b, v7.16b, v31.16b + eor v28.16b, v0.16b, v4.16b + eor v29.16b, v1.16b, v5.16b + eor v30.16b, v2.16b, v6.16b + eor v31.16b, v3.16b, v7.16b + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + eor v0.16b, v12.16b, v4.16b + eor v1.16b, v13.16b, v5.16b + eor v2.16b, v14.16b, v6.16b + eor v3.16b, v15.16b, v7.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + eor v28.16b, v28.16b, v8.16b + eor v29.16b, v29.16b, v9.16b + eor v30.16b, v30.16b, v10.16b + eor v31.16b, v31.16b, v11.16b + shl v8.4s, v28.4s, #8 + shl v9.4s, v29.4s, #8 + shl v10.4s, v30.4s, #8 + shl v11.4s, v31.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + rev32 v2.8h, v2.8h + rev32 v3.8h, v3.8h + sri v8.4s, v28.4s, #24 + sri v9.4s, v29.4s, #24 + sri v10.4s, v30.4s, #24 + sri v11.4s, v31.4s, #24 + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + shl v28.4s, v4.4s, #24 + shl v29.4s, v5.4s, #24 + shl v30.4s, v6.4s, #24 + shl v31.4s, v7.4s, #24 + eor v8.16b, v8.16b, v0.16b + eor v9.16b, v9.16b, v1.16b + eor v10.16b, v10.16b, v2.16b + eor v11.16b, v11.16b, v3.16b + 
sri v28.4s, v4.4s, #8 + sri v29.4s, v5.4s, #8 + sri v30.4s, v6.4s, #8 + sri v31.4s, v7.4s, #8 + eor v8.16b, v8.16b, v28.16b + eor v9.16b, v9.16b, v29.16b + eor v10.16b, v10.16b, v30.16b + eor v11.16b, v11.16b, v31.16b + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x6] + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v8.16b, v8.16b, v4.16b + eor v9.16b, v9.16b, v4.16b + eor v10.16b, v10.16b, v4.16b + eor v11.16b, v11.16b, v4.16b + # Round Done + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v9.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v10.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v11.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v0.16b, v8.16b, v12.16b + eor v1.16b, v9.16b, v12.16b + eor v2.16b, v10.16b, v12.16b + eor v3.16b, v11.16b, v12.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b + tbl v2.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v2.16b + tbl v3.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v3.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + orr v6.16b, v6.16b, v2.16b + orr v7.16b, v7.16b, v3.16b + eor v0.16b, v8.16b, v13.16b + eor v1.16b, v9.16b, v13.16b + eor v2.16b, v10.16b, v13.16b + eor v3.16b, v11.16b, v13.16b + tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + orr v6.16b, v6.16b, v2.16b + orr v7.16b, v7.16b, v3.16b + eor v0.16b, v8.16b, v14.16b + eor v1.16b, v9.16b, v14.16b + eor v2.16b, v10.16b, v14.16b + eor v3.16b, v11.16b, v14.16b + tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b + tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b + tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, 
v2.16b + tbl v3.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v3.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + orr v6.16b, v6.16b, v2.16b + orr v7.16b, v7.16b, v3.16b + ld1 {v8.16b}, [x7] + tbl v4.16b, {v4.16b}, v8.16b + tbl v5.16b, {v5.16b}, v8.16b + tbl v6.16b, {v6.16b}, v8.16b + tbl v7.16b, {v7.16b}, v8.16b + # XOR in Key Schedule + ld1 {v8.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + eor v6.16b, v6.16b, v8.16b + eor v7.16b, v7.16b, v8.16b + # Round Done + rev32 v4.16b, v4.16b + rev32 v5.16b, v5.16b + rev32 v6.16b, v6.16b + rev32 v7.16b, v7.16b + ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x10] + ldr q3, [x10, #64] + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 + sub x2, x2, #0x40 + cmp x2, #0x40 + bge L_AES_CBC_decrypt_NEON_loop_4 +L_AES_CBC_decrypt_NEON_start_2: + cmp x2, #16 + beq L_AES_CBC_decrypt_NEON_start_1 + blt L_AES_CBC_decrypt_NEON_data_done +L_AES_CBC_decrypt_NEON_loop_2: + mov x9, x3 + ld1 {v4.16b, v5.16b}, [x0], #32 + st1 {v3.2d, v4.2d, v5.2d}, [x10] + ld1 {v8.2d}, [x9], #16 + rev32 v4.16b, v4.16b + rev32 v5.16b, v5.16b + # Round: 0 - XOR in key schedule + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + sub w8, w4, #2 +L_AES_CBC_decrypt_NEON_loop_nr_2: + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v0.16b, v4.16b, v12.16b + eor v1.16b, v5.16b, v12.16b + tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v9.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b + eor v2.16b, v4.16b, v13.16b + eor v3.16b, v5.16b, v13.16b + tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + eor v0.16b, v4.16b, v14.16b + eor 
v1.16b, v5.16b, v14.16b + orr v8.16b, v8.16b, v2.16b + orr v9.16b, v9.16b, v3.16b + tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b + tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + ld1 {v4.16b}, [x7] + tbl v8.16b, {v8.16b}, v4.16b + tbl v9.16b, {v9.16b}, v4.16b + movi v2.16b, #27 + sshr v0.16b, v8.16b, #7 + sshr v1.16b, v9.16b, #7 + shl v12.16b, v8.16b, #1 + shl v13.16b, v9.16b, #1 + and v0.16b, v0.16b, v2.16b + and v1.16b, v1.16b, v2.16b + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + ushr v12.16b, v8.16b, #6 + ushr v13.16b, v9.16b, #6 + shl v4.16b, v8.16b, #2 + shl v5.16b, v9.16b, #2 + pmul v12.16b, v12.16b, v2.16b + pmul v13.16b, v13.16b, v2.16b + eor v12.16b, v12.16b, v4.16b + eor v13.16b, v13.16b, v5.16b + ushr v4.16b, v8.16b, #5 + ushr v5.16b, v9.16b, #5 + pmul v4.16b, v4.16b, v2.16b + pmul v5.16b, v5.16b, v2.16b + shl v2.16b, v8.16b, #3 + shl v3.16b, v9.16b, #3 + eor v4.16b, v4.16b, v2.16b + eor v5.16b, v5.16b, v3.16b + eor v2.16b, v0.16b, v4.16b + eor v3.16b, v1.16b, v5.16b + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v0.16b, v12.16b, v4.16b + eor v1.16b, v13.16b, v5.16b + eor v12.16b, v12.16b, v2.16b + eor v13.16b, v13.16b, v3.16b + eor v2.16b, v2.16b, v8.16b + eor v3.16b, v3.16b, v9.16b + shl v8.4s, v2.4s, #8 + shl v9.4s, v3.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + sri v8.4s, v2.4s, #24 + sri v9.4s, v3.4s, #24 + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + shl v2.4s, v4.4s, #24 + shl v3.4s, v5.4s, #24 + eor v8.16b, v8.16b, v0.16b + eor v9.16b, v9.16b, v1.16b + sri v2.4s, v4.4s, #8 + sri v3.4s, v5.4s, #8 + eor v8.16b, v8.16b, v2.16b + eor v9.16b, v9.16b, v3.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v8.16b, v8.16b, v4.16b + eor v9.16b, v9.16b, v4.16b + # Round Done + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v0.16b, v8.16b, v12.16b + eor v1.16b, v9.16b, v12.16b + tbl v4.16b, 
{v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v9.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b + eor v2.16b, v8.16b, v13.16b + eor v3.16b, v9.16b, v13.16b + tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + eor v0.16b, v8.16b, v14.16b + eor v1.16b, v9.16b, v14.16b + orr v4.16b, v4.16b, v2.16b + orr v5.16b, v5.16b, v3.16b + tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b + tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + ld1 {v8.16b}, [x7] + tbl v4.16b, {v4.16b}, v8.16b + tbl v5.16b, {v5.16b}, v8.16b + movi v2.16b, #27 + sshr v0.16b, v4.16b, #7 + sshr v1.16b, v5.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + and v0.16b, v0.16b, v2.16b + and v1.16b, v1.16b, v2.16b + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + ushr v12.16b, v4.16b, #6 + ushr v13.16b, v5.16b, #6 + shl v8.16b, v4.16b, #2 + shl v9.16b, v5.16b, #2 + pmul v12.16b, v12.16b, v2.16b + pmul v13.16b, v13.16b, v2.16b + eor v12.16b, v12.16b, v8.16b + eor v13.16b, v13.16b, v9.16b + ushr v8.16b, v4.16b, #5 + ushr v9.16b, v5.16b, #5 + pmul v8.16b, v8.16b, v2.16b + pmul v9.16b, v9.16b, v2.16b + shl v2.16b, v4.16b, #3 + shl v3.16b, v5.16b, #3 + eor v8.16b, v8.16b, v2.16b + eor v9.16b, v9.16b, v3.16b + eor v2.16b, v0.16b, v8.16b + eor v3.16b, v1.16b, v9.16b + eor v8.16b, v8.16b, v4.16b + eor v9.16b, v9.16b, v5.16b + eor v0.16b, v12.16b, v8.16b + eor v1.16b, v13.16b, v9.16b + eor v12.16b, v12.16b, v2.16b + eor v13.16b, v13.16b, v3.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v5.16b + shl v4.4s, v2.4s, #8 + shl v5.4s, v3.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + sri v4.4s, v2.4s, #24 + sri v5.4s, v3.4s, #24 + eor v4.16b, v4.16b, v12.16b + eor 
v5.16b, v5.16b, v13.16b + shl v2.4s, v8.4s, #24 + shl v3.4s, v9.4s, #24 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + sri v2.4s, v8.4s, #8 + sri v3.4s, v9.4s, #8 + eor v4.16b, v4.16b, v2.16b + eor v5.16b, v5.16b, v3.16b + # XOR in Key Schedule + ld1 {v8.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + # Round Done + subs w8, w8, #2 + bne L_AES_CBC_decrypt_NEON_loop_nr_2 + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v0.16b, v4.16b, v12.16b + eor v1.16b, v5.16b, v12.16b + tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v9.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b + eor v2.16b, v4.16b, v13.16b + eor v3.16b, v5.16b, v13.16b + tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + eor v0.16b, v4.16b, v14.16b + eor v1.16b, v5.16b, v14.16b + orr v8.16b, v8.16b, v2.16b + orr v9.16b, v9.16b, v3.16b + tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b + tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b + orr v8.16b, v8.16b, v0.16b + orr v9.16b, v9.16b, v1.16b + ld1 {v4.16b}, [x7] + tbl v8.16b, {v8.16b}, v4.16b + tbl v9.16b, {v9.16b}, v4.16b + movi v2.16b, #27 + sshr v0.16b, v8.16b, #7 + sshr v1.16b, v9.16b, #7 + shl v12.16b, v8.16b, #1 + shl v13.16b, v9.16b, #1 + and v0.16b, v0.16b, v2.16b + and v1.16b, v1.16b, v2.16b + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + ushr v12.16b, v8.16b, #6 + ushr v13.16b, v9.16b, #6 + shl v4.16b, v8.16b, #2 + shl v5.16b, v9.16b, #2 + pmul v12.16b, v12.16b, v2.16b + pmul v13.16b, v13.16b, v2.16b + eor v12.16b, v12.16b, v4.16b + eor v13.16b, v13.16b, v5.16b + ushr v4.16b, v8.16b, #5 + ushr v5.16b, v9.16b, #5 + pmul v4.16b, v4.16b, v2.16b + pmul v5.16b, v5.16b, v2.16b + shl v2.16b, v8.16b, #3 + 
shl v3.16b, v9.16b, #3 + eor v4.16b, v4.16b, v2.16b + eor v5.16b, v5.16b, v3.16b + eor v2.16b, v0.16b, v4.16b + eor v3.16b, v1.16b, v5.16b + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v0.16b, v12.16b, v4.16b + eor v1.16b, v13.16b, v5.16b + eor v12.16b, v12.16b, v2.16b + eor v13.16b, v13.16b, v3.16b + eor v2.16b, v2.16b, v8.16b + eor v3.16b, v3.16b, v9.16b + shl v8.4s, v2.4s, #8 + shl v9.4s, v3.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + sri v8.4s, v2.4s, #24 + sri v9.4s, v3.4s, #24 + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + shl v2.4s, v4.4s, #24 + shl v3.4s, v5.4s, #24 + eor v8.16b, v8.16b, v0.16b + eor v9.16b, v9.16b, v1.16b + sri v2.4s, v4.4s, #8 + sri v3.4s, v5.4s, #8 + eor v8.16b, v8.16b, v2.16b + eor v9.16b, v9.16b, v3.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x9], #16 + eor v8.16b, v8.16b, v4.16b + eor v9.16b, v9.16b, v4.16b + # Round Done + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v0.16b, v8.16b, v12.16b + eor v1.16b, v9.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v9.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b + eor v2.16b, v8.16b, v13.16b + eor v3.16b, v9.16b, v13.16b + tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b + tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + eor v0.16b, v8.16b, v14.16b + eor v1.16b, v9.16b, v14.16b + orr v4.16b, v4.16b, v2.16b + orr v5.16b, v5.16b, v3.16b + tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b + tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b + orr v4.16b, v4.16b, v0.16b + orr v5.16b, v5.16b, v1.16b + ld1 {v8.16b}, [x7] + tbl v4.16b, {v4.16b}, v8.16b + tbl v5.16b, {v5.16b}, v8.16b + # XOR in Key Schedule + ld1 {v8.2d}, [x9], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v8.16b + # Round 
Done + rev32 v4.16b, v4.16b + rev32 v5.16b, v5.16b + ld1 {v1.16b, v2.16b, v3.16b}, [x10] + eor v4.16b, v4.16b, v1.16b + eor v5.16b, v5.16b, v2.16b + st1 {v4.16b, v5.16b}, [x1], #32 + sub x2, x2, #32 + cmp x2, #32 + bge L_AES_CBC_decrypt_NEON_loop_2 + cmp x2, #0 + beq L_AES_CBC_decrypt_NEON_data_done +L_AES_CBC_decrypt_NEON_start_1: + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + movi v15.16b, #27 + ld1 {v7.2d}, [x7] + mov x9, x3 + ld1 {v4.16b}, [x0], #16 + mov v10.16b, v3.16b + mov v11.16b, v4.16b + ld1 {v8.16b}, [x9], #16 + rev32 v4.16b, v4.16b + # Round: 0 - XOR in key schedule + eor v4.16b, v4.16b, v8.16b + sub w8, w4, #2 +L_AES_CBC_decrypt_NEON_loop_nr_1: + eor v0.16b, v4.16b, v12.16b + eor v1.16b, v4.16b, v13.16b + eor v2.16b, v4.16b, v14.16b + tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b + orr v8.16b, v8.16b, v0.16b + orr v1.16b, v1.16b, v2.16b + orr v8.16b, v8.16b, v1.16b + tbl v8.16b, {v8.16b}, v7.16b + sshr v2.16b, v8.16b, #7 + ushr v3.16b, v8.16b, #6 + ushr v0.16b, v8.16b, #5 + and v2.16b, v2.16b, v15.16b + pmul v3.16b, v3.16b, v15.16b + pmul v0.16b, v0.16b, v15.16b + shl v1.16b, v8.16b, #1 + eor v2.16b, v2.16b, v1.16b + shl v1.16b, v8.16b, #3 + eor v0.16b, v0.16b, v1.16b + shl v1.16b, v8.16b, #2 + eor v3.16b, v3.16b, v1.16b + eor v1.16b, v2.16b, v0.16b + eor v0.16b, v0.16b, v8.16b + eor v2.16b, v3.16b, v0.16b + eor v3.16b, v3.16b, v1.16b + eor v1.16b, v1.16b, v8.16b + shl v8.4s, v1.4s, #8 + rev32 v2.8h, v2.8h + sri v8.4s, v1.4s, #24 + eor v8.16b, v8.16b, v3.16b + shl v1.4s, v0.4s, #24 + eor v8.16b, v8.16b, v2.16b + sri v1.4s, v0.4s, #8 + eor v8.16b, v8.16b, v1.16b + ld1 {v4.2d}, [x9], #16 + # XOR in Key Schedule + eor v8.16b, v8.16b, v4.16b + eor v0.16b, v8.16b, v12.16b + eor v1.16b, v8.16b, v13.16b + eor v2.16b, v8.16b, v14.16b + tbl v4.16b, 
{v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b + orr v4.16b, v4.16b, v0.16b + orr v1.16b, v1.16b, v2.16b + orr v4.16b, v4.16b, v1.16b + tbl v4.16b, {v4.16b}, v7.16b + sshr v2.16b, v4.16b, #7 + ushr v3.16b, v4.16b, #6 + ushr v0.16b, v4.16b, #5 + and v2.16b, v2.16b, v15.16b + pmul v3.16b, v3.16b, v15.16b + pmul v0.16b, v0.16b, v15.16b + shl v1.16b, v4.16b, #1 + eor v2.16b, v2.16b, v1.16b + shl v1.16b, v4.16b, #3 + eor v0.16b, v0.16b, v1.16b + shl v1.16b, v4.16b, #2 + eor v3.16b, v3.16b, v1.16b + eor v1.16b, v2.16b, v0.16b + eor v0.16b, v0.16b, v4.16b + eor v2.16b, v3.16b, v0.16b + eor v3.16b, v3.16b, v1.16b + eor v1.16b, v1.16b, v4.16b + shl v4.4s, v1.4s, #8 + rev32 v2.8h, v2.8h + sri v4.4s, v1.4s, #24 + eor v4.16b, v4.16b, v3.16b + shl v1.4s, v0.4s, #24 + eor v4.16b, v4.16b, v2.16b + sri v1.4s, v0.4s, #8 + eor v4.16b, v4.16b, v1.16b + ld1 {v8.2d}, [x9], #16 + # XOR in Key Schedule + eor v4.16b, v4.16b, v8.16b + subs w8, w8, #2 + bne L_AES_CBC_decrypt_NEON_loop_nr_1 + eor v0.16b, v4.16b, v12.16b + eor v1.16b, v4.16b, v13.16b + eor v2.16b, v4.16b, v14.16b + tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b + orr v8.16b, v8.16b, v0.16b + orr v1.16b, v1.16b, v2.16b + orr v8.16b, v8.16b, v1.16b + tbl v8.16b, {v8.16b}, v7.16b + sshr v2.16b, v8.16b, #7 + ushr v3.16b, v8.16b, #6 + ushr v0.16b, v8.16b, #5 + and v2.16b, v2.16b, v15.16b + pmul v3.16b, v3.16b, v15.16b + pmul v0.16b, v0.16b, v15.16b + shl v1.16b, v8.16b, #1 + eor v2.16b, v2.16b, v1.16b + shl v1.16b, v8.16b, #3 + eor v0.16b, v0.16b, v1.16b + shl v1.16b, v8.16b, #2 + eor v3.16b, v3.16b, v1.16b + eor v1.16b, v2.16b, v0.16b + eor v0.16b, v0.16b, v8.16b + eor 
v2.16b, v3.16b, v0.16b + eor v3.16b, v3.16b, v1.16b + eor v1.16b, v1.16b, v8.16b + shl v8.4s, v1.4s, #8 + rev32 v2.8h, v2.8h + sri v8.4s, v1.4s, #24 + eor v8.16b, v8.16b, v3.16b + shl v1.4s, v0.4s, #24 + eor v8.16b, v8.16b, v2.16b + sri v1.4s, v0.4s, #8 + eor v8.16b, v8.16b, v1.16b + ld1 {v4.2d}, [x9], #16 + # XOR in Key Schedule + eor v8.16b, v8.16b, v4.16b + eor v0.16b, v8.16b, v12.16b + eor v1.16b, v8.16b, v13.16b + eor v2.16b, v8.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b + tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b + tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b + tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b + orr v4.16b, v4.16b, v0.16b + orr v1.16b, v1.16b, v2.16b + orr v4.16b, v4.16b, v1.16b + tbl v4.16b, {v4.16b}, v7.16b + ld1 {v8.2d}, [x9], #16 + # XOR in Key Schedule + eor v4.16b, v4.16b, v8.16b + rev32 v4.16b, v4.16b + mov v3.16b, v11.16b + eor v4.16b, v4.16b, v10.16b + st1 {v4.16b}, [x1], #16 +L_AES_CBC_decrypt_NEON_data_done: + st1 {v3.2d}, [x5] + ldp d8, d9, [x29, #96] + ldp d10, d11, [x29, #112] + ldp d12, d13, [x29, #128] + ldp d14, d15, [x29, #144] + ldp x29, x30, [sp], #0xa0 + ret +#ifndef __APPLE__ + .size AES_CBC_decrypt_NEON,.-AES_CBC_decrypt_NEON +#endif /* __APPLE__ */ +#endif /* HAVE_AES_CBC */ +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || HAVE_AES_CBC + * HAVE_AES_ECB */ +#endif /* HAVE_AES_DECRYPT */ +#ifdef HAVE_AESGCM +#ifndef __APPLE__ +.text +.globl GCM_gmult_len_NEON +.type GCM_gmult_len_NEON,@function +.align 2 +GCM_gmult_len_NEON: +#else +.section __TEXT,__text +.globl _GCM_gmult_len_NEON +.p2align 2 +_GCM_gmult_len_NEON: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! 
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] + ld1 {v18.2d}, [x0] + ld1 {v10.2d}, [x1] + movi v19.16b, #15 + eor v20.16b, v20.16b, v20.16b + rbit v18.16b, v18.16b + rbit v10.16b, v10.16b + and v12.16b, v10.16b, v19.16b + ushr v13.16b, v10.16b, #4 + eor v14.16b, v12.16b, v13.16b +L_GCM_gmult_len_NEON_start_block: + ld1 {v0.16b}, [x2], #16 + rbit v0.16b, v0.16b + eor v18.16b, v18.16b, v0.16b + # Mul 128x128 + and v15.16b, v18.16b, v19.16b + ushr v16.16b, v18.16b, #4 + eor v17.16b, v15.16b, v16.16b + dup v0.16b, v12.b[0] + dup v2.16b, v14.b[0] + dup v1.16b, v13.b[0] + pmul v8.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v8.16b + eor v5.16b, v5.16b, v4.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v8.16b, v8.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + dup v0.16b, v12.b[1] + dup v2.16b, v14.b[1] + dup v1.16b, v13.b[1] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v6.16b, v20.16b, v3.16b, #15 + ext v9.16b, v3.16b, v20.16b, #15 + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[2] + dup v2.16b, v14.b[2] + dup v1.16b, v13.b[2] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #14 + ext v6.16b, v20.16b, v3.16b, #14 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[3] + dup v2.16b, v14.b[3] + dup v1.16b, v13.b[3] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul 
v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #13 + ext v6.16b, v20.16b, v3.16b, #13 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[4] + dup v2.16b, v14.b[4] + dup v1.16b, v13.b[4] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #12 + ext v6.16b, v20.16b, v3.16b, #12 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[5] + dup v2.16b, v14.b[5] + dup v1.16b, v13.b[5] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #11 + ext v6.16b, v20.16b, v3.16b, #11 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[6] + dup v2.16b, v14.b[6] + dup v1.16b, v13.b[6] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #10 + ext v6.16b, v20.16b, v3.16b, #10 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[7] + dup v2.16b, v14.b[7] + dup v1.16b, v13.b[7] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, 
v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #9 + ext v6.16b, v20.16b, v3.16b, #9 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[8] + dup v2.16b, v14.b[8] + dup v1.16b, v13.b[8] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #8 + ext v6.16b, v20.16b, v3.16b, #8 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[9] + dup v2.16b, v14.b[9] + dup v1.16b, v13.b[9] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #7 + ext v6.16b, v20.16b, v3.16b, #7 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[10] + dup v2.16b, v14.b[10] + dup v1.16b, v13.b[10] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #6 + ext v6.16b, v20.16b, v3.16b, #6 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[11] + dup v2.16b, v14.b[11] + dup v1.16b, v13.b[11] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, 
v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #5 + ext v6.16b, v20.16b, v3.16b, #5 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[12] + dup v2.16b, v14.b[12] + dup v1.16b, v13.b[12] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #4 + ext v6.16b, v20.16b, v3.16b, #4 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[13] + dup v2.16b, v14.b[13] + dup v1.16b, v13.b[13] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #3 + ext v6.16b, v20.16b, v3.16b, #3 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[14] + dup v2.16b, v14.b[14] + dup v1.16b, v13.b[14] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + ushr v7.16b, v5.16b, #4 + eor v3.16b, v3.16b, v6.16b + eor v11.16b, v4.16b, v7.16b + ext v7.16b, v3.16b, v20.16b, #2 + ext v6.16b, v20.16b, v3.16b, #2 + eor v9.16b, v9.16b, v7.16b + eor v8.16b, v8.16b, v6.16b + dup v0.16b, v12.b[15] + dup v2.16b, v14.b[15] + dup v1.16b, v13.b[15] + pmul v3.16b, v15.16b, v0.16b + pmul v5.16b, v17.16b, v2.16b + pmul v4.16b, v16.16b, v1.16b + eor v5.16b, v5.16b, v3.16b + eor v5.16b, v5.16b, v4.16b + eor v3.16b, v3.16b, v11.16b + shl v6.16b, v5.16b, #4 + 
ushr v7.16b, v5.16b, #4
+ eor v3.16b, v3.16b, v6.16b
+ eor v11.16b, v4.16b, v7.16b
+ ext v7.16b, v3.16b, v20.16b, #1
+ ext v6.16b, v20.16b, v3.16b, #1
+ eor v9.16b, v9.16b, v7.16b
+ eor v8.16b, v8.16b, v6.16b
+ eor v9.16b, v9.16b, v11.16b
+ # Reduce 254-bit number
+ shl v0.16b, v9.16b, #1
+ shl v1.16b, v9.16b, #2
+ shl v2.16b, v9.16b, #7
+ ushr v3.16b, v9.16b, #7
+ ushr v4.16b, v9.16b, #6
+ ushr v5.16b, v9.16b, #1
+ eor v0.16b, v0.16b, v9.16b
+ eor v1.16b, v1.16b, v2.16b
+ eor v0.16b, v0.16b, v1.16b
+ eor v8.16b, v8.16b, v0.16b
+ ext v0.16b, v20.16b, v3.16b, #15
+ ext v1.16b, v20.16b, v4.16b, #15
+ ext v2.16b, v20.16b, v5.16b, #15
+ ext v4.16b, v4.16b, v20.16b, #15
+ ext v5.16b, v5.16b, v20.16b, #15
+ eor v0.16b, v0.16b, v1.16b
+ eor v8.16b, v8.16b, v2.16b
+ eor v8.16b, v8.16b, v0.16b
+ eor v3.16b, v4.16b, v5.16b
+ shl v0.2d, v3.2d, #1
+ shl v1.2d, v3.2d, #2
+ shl v2.2d, v3.2d, #7
+ eor v3.16b, v3.16b, v0.16b
+ eor v1.16b, v1.16b, v2.16b
+ eor v8.16b, v8.16b, v3.16b
+ eor v18.16b, v8.16b, v1.16b
+ subs x3, x3, #16
+ bne L_GCM_gmult_len_NEON_start_block
+ rbit v18.16b, v18.16b
+ st1 {v18.2d}, [x0]
+ ldp d8, d9, [x29, #16]
+ ldp d10, d11, [x29, #32]
+ ldp d12, d13, [x29, #48]
+ ldp d14, d15, [x29, #64]
+ ldp x29, x30, [sp], #0x50
+ ret
+#ifndef __APPLE__
+ .size GCM_gmult_len_NEON,.-GCM_gmult_len_NEON
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+.text
+.globl AES_GCM_encrypt_NEON
+.type AES_GCM_encrypt_NEON,@function
+.align 2
+AES_GCM_encrypt_NEON:
+#else
+.section __TEXT,__text
+.globl _AES_GCM_encrypt_NEON
+.p2align 2
+_AES_GCM_encrypt_NEON:
+#endif /* __APPLE__ */
+ stp x29, x30, [sp, #-80]! /* AES_GCM_encrypt_NEON: encrypts successive counter blocks and XORs them with the data (no GHASH is computed here). As used below: x0 = input, x1 = output, x2 = byte count, x3 = round keys, w4 = number of rounds, x5 = 16-byte counter block (read, then written back). */
+ add x29, sp, #0
+ stp d8, d9, [x29, #16] /* d8-d15 are callee-saved under AAPCS64; v8-v15 are used as scratch below */
+ stp d10, d11, [x29, #32]
+ stp d12, d13, [x29, #48]
+ stp d14, d15, [x29, #64]
+#ifndef __APPLE__
+ adrp x9, L_AES_ARM64_NEON_te
+ add x9, x9, :lo12:L_AES_ARM64_NEON_te
+#else
+ adrp x9, L_AES_ARM64_NEON_te@PAGE
+ add x9, x9, :lo12:L_AES_ARM64_NEON_te@PAGEOFF
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+ adrp x10, L_AES_ARM64_NEON_shift_rows_shuffle
+ add x10, x10, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle
+#else
+ adrp x10, L_AES_ARM64_NEON_shift_rows_shuffle@PAGE
+ add x10, x10, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle@PAGEOFF
+#endif /* __APPLE__ */
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #0x40 /* v16-v31 = the 256 bytes at L_AES_ARM64_NEON_te, held as four 64-byte TBL tables (presumably the AES S-box - confirm against the table definition) */
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x9], #0x40
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x9], #0x40
+ ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x9]
+ ld1 {v2.2d}, [x5] /* v2 = counter block from [x5] */
+ rev32 v2.16b, v2.16b /* byte-swap every 32-bit word; the same swap is applied again to the cipher output and to the counter before it is stored */
+ mov w6, v2.s[3] /* w6 = last 32-bit word of the block; it is incremented by 1 before each block is encrypted */
+ cmp x2, #0x40
+ blt L_AES_GCM_encrypt_NEON_start_2 /* fewer than 64 bytes: skip the 4-block loop */
+ mov x7, v2.d[0] /* keep the counter block in x7:x8 because v2 is reused as a data register in the 4-block loop */
+ mov x8, v2.d[1]
+L_AES_GCM_encrypt_NEON_loop_4: /* 4 blocks (64 bytes) per iteration */
+ mov x12, x3 /* x12 walks the round keys, 16 bytes per round */
+ ld1 {v4.2d}, [x12], #16
+ mov v8.d[0], x7
+ mov v8.d[1], x8
+ # Round: 0 - XOR in key schedule
+ add w6, w6, #1
+ mov v8.s[3], w6
+ eor v0.16b, v8.16b, v4.16b
+ add w6, w6, #1
+ mov v8.s[3], w6
+ eor v1.16b, v8.16b, v4.16b
+ add w6, w6, #1
+ mov v8.s[3], w6
+ eor v2.16b, v8.16b, v4.16b
+ add w6, w6, #1
+ mov v8.s[3], w6
+ eor v3.16b, v8.16b, v4.16b
+ sub w11, w4, #2 /* the loop below performs 2 rounds per pass; the last 2 rounds follow it */
+L_AES_GCM_encrypt_NEON_loop_nr_4:
+ tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b /* byte substitution: a 4-register TBL returns 0 for any index >= 64, so the index is XORed with 0x40/0x80/0xc0 to address each later 64-byte quarter and the four results are ORed together */
+ tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b
+ tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b
+ tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b
+ movi v12.16b, #0x40
+ movi v13.16b, #0x80
+ movi v14.16b, #0xc0
+ eor v8.16b, v0.16b, v12.16b
+ eor v9.16b, v1.16b, v12.16b
+ eor v10.16b, v2.16b, v12.16b
+ eor v11.16b, v3.16b, v12.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b
+ tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b
+ tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v5.16b, v5.16b, v9.16b
+ orr v6.16b, v6.16b, v10.16b
+ orr v7.16b, v7.16b, v11.16b
+ eor v8.16b, v0.16b, v13.16b
+ eor v9.16b, v1.16b, v13.16b
+ eor v10.16b, v2.16b, v13.16b
+ eor v11.16b, v3.16b, v13.16b
+ tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b
+ tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b
+ tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b
+ tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v5.16b, v5.16b, v9.16b
+ orr v6.16b, v6.16b, v10.16b
+ orr v7.16b, v7.16b, v11.16b
+ eor v8.16b, v0.16b, v14.16b
+ eor v9.16b, v1.16b, v14.16b
+ eor v10.16b, v2.16b, v14.16b
+ eor v11.16b, v3.16b, v14.16b
+ tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b
+ tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b
+ tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b
+ tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v5.16b, v5.16b, v9.16b
+ orr v6.16b, v6.16b, v10.16b
+ orr v7.16b, v7.16b, v11.16b
+ ld1 {v0.16b}, [x10] /* 16-byte permutation pattern (ShiftRows, going by the label name) applied to each block with a 1-register TBL */
+ tbl v4.16b, {v4.16b}, v0.16b
+ tbl v5.16b, {v5.16b}, v0.16b
+ tbl v6.16b, {v6.16b}, v0.16b
+ tbl v7.16b, {v7.16b}, v0.16b
+ sshr v8.16b, v4.16b, #7 /* xtime per byte: (b << 1) ^ (0x1b if the top bit of b was set), i.e. multiply by 2 in GF(2^8); sshr #7 produces the 0x00/0xff mask */
+ sshr v9.16b, v5.16b, #7
+ sshr v10.16b, v6.16b, #7
+ sshr v11.16b, v7.16b, #7
+ shl v12.16b, v4.16b, #1
+ shl v13.16b, v5.16b, #1
+ shl v14.16b, v6.16b, #1
+ shl v15.16b, v7.16b, #1
+ movi v0.16b, #27
+ and v8.16b, v8.16b, v0.16b
+ and v9.16b, v9.16b, v0.16b
+ and v10.16b, v10.16b, v0.16b
+ and v11.16b, v11.16b, v0.16b
+ eor v8.16b, v8.16b, v12.16b
+ eor v9.16b, v9.16b, v13.16b
+ eor v10.16b, v10.16b, v14.16b
+ eor v11.16b, v11.16b, v15.16b
+ eor v0.16b, v8.16b, v4.16b
+ eor v1.16b, v9.16b, v5.16b
+ eor v2.16b, v10.16b, v6.16b
+ eor v3.16b, v11.16b, v7.16b
+ shl v12.4s, v0.4s, #8 /* column mixing: XOR of 32-bit word rotations (shl+sri pairs, rev32 on halfwords) of the state, xtime(state) and state^xtime(state) */
+ shl v13.4s, v1.4s, #8
+ shl v14.4s, v2.4s, #8
+ shl v15.4s, v3.4s, #8
+ sri v12.4s, v0.4s, #24
+ sri v13.4s, v1.4s, #24
+ sri v14.4s, v2.4s, #24
+ sri v15.4s, v3.4s, #24
+ shl v0.4s, v4.4s, #24
+ shl v1.4s, v5.4s, #24
+ shl v2.4s, v6.4s, #24
+ shl v3.4s, v7.4s, #24
+ sri v0.4s, v4.4s, #8
+ sri v1.4s, v5.4s, #8
+ sri v2.4s, v6.4s, #8
+ sri v3.4s, v7.4s, #8
+ rev32 v4.8h, v4.8h
+ rev32 v5.8h, v5.8h
+ rev32 v6.8h, v6.8h
+ rev32 v7.8h, v7.8h
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v1.16b
+ eor v6.16b, v6.16b, v2.16b
+ eor v7.16b, v7.16b, v3.16b
+ # XOR in Key Schedule
+ ld1 {v0.2d}, [x12], #16
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v7.16b, v7.16b, v11.16b
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v6.16b, v6.16b, v0.16b
+ eor v7.16b, v7.16b, v0.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+ # Round Done
+ tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b /* second round of the pass: same steps with the roles of v0-v3 and v4-v7 swapped */
+ tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b
+ tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b
+ tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b
+ movi v12.16b, #0x40
+ movi v13.16b, #0x80
+ movi v14.16b, #0xc0
+ eor v8.16b, v4.16b, v12.16b
+ eor v9.16b, v5.16b, v12.16b
+ eor v10.16b, v6.16b, v12.16b
+ eor v11.16b, v7.16b, v12.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b
+ tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b
+ tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b
+ orr v2.16b, v2.16b, v10.16b
+ orr v3.16b, v3.16b, v11.16b
+ eor v8.16b, v4.16b, v13.16b
+ eor v9.16b, v5.16b, v13.16b
+ eor v10.16b, v6.16b, v13.16b
+ eor v11.16b, v7.16b, v13.16b
+ tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b
+ tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b
+ tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b
+ tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b
+ orr v2.16b, v2.16b, v10.16b
+ orr v3.16b, v3.16b, v11.16b
+ eor v8.16b, v4.16b, v14.16b
+ eor v9.16b, v5.16b, v14.16b
+ eor v10.16b, v6.16b, v14.16b
+ eor v11.16b, v7.16b, v14.16b
+ tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b
+ tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b
+ tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b
+ tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b
+ orr v2.16b, v2.16b, v10.16b
+ orr v3.16b, v3.16b, v11.16b
+ ld1 {v4.16b}, [x10]
+ tbl v0.16b, {v0.16b}, v4.16b
+ tbl v1.16b, {v1.16b}, v4.16b
+ tbl v2.16b, {v2.16b}, v4.16b
+ tbl v3.16b, {v3.16b}, v4.16b
+ sshr v8.16b, v0.16b, #7
+ sshr v9.16b, v1.16b, #7
+ sshr v10.16b, v2.16b, #7
+ sshr v11.16b, v3.16b, #7
+ shl v12.16b, v0.16b, #1
+ shl v13.16b, v1.16b, #1
+ shl v14.16b, v2.16b, #1
+ shl v15.16b, v3.16b, #1
+ movi v4.16b, #27
+ and v8.16b, v8.16b, v4.16b
+ and v9.16b, v9.16b, v4.16b
+ and v10.16b, v10.16b, v4.16b
+ and v11.16b, v11.16b, v4.16b
+ eor v8.16b, v8.16b, v12.16b
+ eor v9.16b, v9.16b, v13.16b
+ eor v10.16b, v10.16b, v14.16b
+ eor v11.16b, v11.16b, v15.16b
+ eor v4.16b, v8.16b, v0.16b
+ eor v5.16b, v9.16b, v1.16b
+ eor v6.16b, v10.16b, v2.16b
+ eor v7.16b, v11.16b, v3.16b
+ shl v12.4s, v4.4s, #8
+ shl v13.4s, v5.4s, #8
+ shl v14.4s, v6.4s, #8
+ shl v15.4s, v7.4s, #8
+ sri v12.4s, v4.4s, #24
+ sri v13.4s, v5.4s, #24
+ sri v14.4s, v6.4s, #24
+ sri v15.4s, v7.4s, #24
+ shl v4.4s, v0.4s, #24
+ shl v5.4s, v1.4s, #24
+ shl v6.4s, v2.4s, #24
+ shl v7.4s, v3.4s, #24
+ sri v4.4s, v0.4s, #8
+ sri v5.4s, v1.4s, #8
+ sri v6.4s, v2.4s, #8
+ sri v7.4s, v3.4s, #8
+ rev32 v0.8h, v0.8h
+ rev32 v1.8h, v1.8h
+ rev32 v2.8h, v2.8h
+ rev32 v3.8h, v3.8h
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v6.16b
+ eor v3.16b, v3.16b, v7.16b
+ # XOR in Key Schedule
+ ld1 {v4.2d}, [x12], #16
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v4.16b
+ eor v2.16b, v2.16b, v4.16b
+ eor v3.16b, v3.16b, v4.16b
+ eor v0.16b, v0.16b, v12.16b
+ eor v1.16b, v1.16b, v13.16b
+ eor v2.16b, v2.16b, v14.16b
+ eor v3.16b, v3.16b, v15.16b
+ # Round Done
+ subs w11, w11, #2
+ bne L_AES_GCM_encrypt_NEON_loop_nr_4
+ tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b /* second-to-last round: identical to the first round of a loop pass */
+ tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b
+ tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b
+ tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b
+ movi v12.16b, #0x40
+ movi v13.16b, #0x80
+ movi v14.16b, #0xc0
+ eor v8.16b, v0.16b, v12.16b
+ eor v9.16b, v1.16b, v12.16b
+ eor v10.16b, v2.16b, v12.16b
+ eor v11.16b, v3.16b, v12.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b
+ tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b
+ tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v5.16b, v5.16b, v9.16b
+ orr v6.16b, v6.16b, v10.16b
+ orr v7.16b, v7.16b, v11.16b
+ eor v8.16b, v0.16b, v13.16b
+ eor v9.16b, v1.16b, v13.16b
+ eor v10.16b, v2.16b, v13.16b
+ eor v11.16b, v3.16b, v13.16b
+ tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b
+ tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b
+ tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b
+ tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v5.16b, v5.16b, v9.16b
+ orr v6.16b, v6.16b, v10.16b
+ orr v7.16b, v7.16b, v11.16b
+ eor v8.16b, v0.16b, v14.16b
+ eor v9.16b, v1.16b, v14.16b
+ eor v10.16b, v2.16b, v14.16b
+ eor v11.16b, v3.16b, v14.16b
+ tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b
+ tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b
+ tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b
+ tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v5.16b, v5.16b, v9.16b
+ orr v6.16b, v6.16b, v10.16b
+ orr v7.16b, v7.16b, v11.16b
+ ld1 {v0.16b}, [x10]
+ tbl v4.16b, {v4.16b}, v0.16b
+ tbl v5.16b, {v5.16b}, v0.16b
+ tbl v6.16b, {v6.16b}, v0.16b
+ tbl v7.16b, {v7.16b}, v0.16b
+ sshr v8.16b, v4.16b, #7
+ sshr v9.16b, v5.16b, #7
+ sshr v10.16b, v6.16b, #7
+ sshr v11.16b, v7.16b, #7
+ shl v12.16b, v4.16b, #1
+ shl v13.16b, v5.16b, #1
+ shl v14.16b, v6.16b, #1
+ shl v15.16b, v7.16b, #1
+ movi v0.16b, #27
+ and v8.16b, v8.16b, v0.16b
+ and v9.16b, v9.16b, v0.16b
+ and v10.16b, v10.16b, v0.16b
+ and v11.16b, v11.16b, v0.16b
+ eor v8.16b, v8.16b, v12.16b
+ eor v9.16b, v9.16b, v13.16b
+ eor v10.16b, v10.16b, v14.16b
+ eor v11.16b, v11.16b, v15.16b
+ eor v0.16b, v8.16b, v4.16b
+ eor v1.16b, v9.16b, v5.16b
+ eor v2.16b, v10.16b, v6.16b
+ eor v3.16b, v11.16b, v7.16b
+ shl v12.4s, v0.4s, #8
+ shl v13.4s, v1.4s, #8
+ shl v14.4s, v2.4s, #8
+ shl v15.4s, v3.4s, #8
+ sri v12.4s, v0.4s, #24
+ sri v13.4s, v1.4s, #24
+ sri v14.4s, v2.4s, #24
+ sri v15.4s, v3.4s, #24
+ shl v0.4s, v4.4s, #24
+ shl v1.4s, v5.4s, #24
+ shl v2.4s, v6.4s, #24
+ shl v3.4s, v7.4s, #24
+ sri v0.4s, v4.4s, #8
+ sri v1.4s, v5.4s, #8
+ sri v2.4s, v6.4s, #8
+ sri v3.4s, v7.4s, #8
+ rev32 v4.8h, v4.8h
+ rev32 v5.8h, v5.8h
+ rev32 v6.8h, v6.8h
+ rev32 v7.8h, v7.8h
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v1.16b
+ eor v6.16b, v6.16b, v2.16b
+ eor v7.16b, v7.16b, v3.16b
+ # XOR in Key Schedule
+ ld1 {v0.2d}, [x12], #16
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v6.16b, v6.16b, v10.16b
+ eor v7.16b, v7.16b, v11.16b
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v6.16b, v6.16b, v0.16b
+ eor v7.16b, v7.16b, v0.16b
+ eor v4.16b, v4.16b, v12.16b
+ eor v5.16b, v5.16b, v13.16b
+ eor v6.16b, v6.16b, v14.16b
+ eor v7.16b, v7.16b, v15.16b
+ # Round Done
+ tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b /* last round: substitution, byte permutation and round key only - there is no xtime/rotation mixing step */
+ tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b
+ tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b
+ tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b
+ movi v12.16b, #0x40
+ movi v13.16b, #0x80
+ movi v14.16b, #0xc0
+ eor v8.16b, v4.16b, v12.16b
+ eor v9.16b, v5.16b, v12.16b
+ eor v10.16b, v6.16b, v12.16b
+ eor v11.16b, v7.16b, v12.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b
+ tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b
+ tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b
+ orr v2.16b, v2.16b, v10.16b
+ orr v3.16b, v3.16b, v11.16b
+ eor v8.16b, v4.16b, v13.16b
+ eor v9.16b, v5.16b, v13.16b
+ eor v10.16b, v6.16b, v13.16b
+ eor v11.16b, v7.16b, v13.16b
+ tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b
+ tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b
+ tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b
+ tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b
+ orr v2.16b, v2.16b, v10.16b
+ orr v3.16b, v3.16b, v11.16b
+ eor v8.16b, v4.16b, v14.16b
+ eor v9.16b, v5.16b, v14.16b
+ eor v10.16b, v6.16b, v14.16b
+ eor v11.16b, v7.16b, v14.16b
+ tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b
+ tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b
+ tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b
+ tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b
+ orr v2.16b, v2.16b, v10.16b
+ orr v3.16b, v3.16b, v11.16b
+ ld1 {v4.16b}, [x10]
+ tbl v0.16b, {v0.16b}, v4.16b
+ tbl v1.16b, {v1.16b}, v4.16b
+ tbl v2.16b, {v2.16b}, v4.16b
+ tbl v3.16b, {v3.16b}, v4.16b
+ # XOR in Key Schedule
+ ld1 {v4.2d}, [x12], #16
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v4.16b
+ eor v2.16b, v2.16b, v4.16b
+ eor v3.16b, v3.16b, v4.16b
+ # Round Done
+ rev32 v0.16b, v0.16b /* swap the bytes of each word back before combining with the data */
+ rev32 v1.16b, v1.16b
+ rev32 v2.16b, v2.16b
+ rev32 v3.16b, v3.16b
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #0x40 /* XOR the four encrypted counter blocks with 64 input bytes and store to the output */
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v6.16b
+ eor v3.16b, v3.16b, v7.16b
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #0x40
+ sub x2, x2, #0x40
+ cmp x2, #0x40
+ bge L_AES_GCM_encrypt_NEON_loop_4
+ mov v2.d[0], x7 /* rebuild the counter block in v2 with the current counter word for the tail paths */
+ mov v2.d[1], x8
+ mov v2.s[3], w6
+L_AES_GCM_encrypt_NEON_start_2:
+ movi v12.16b, #0x40 /* the tail paths keep the index offsets and 0x1b in v12-v15 instead of reloading them every round */
+ movi v13.16b, #0x80
+ movi v14.16b, #0xc0
+ movi v15.16b, #27
+ cmp x2, #16 /* exactly 16 bytes left -> single block; fewer -> done; otherwise two blocks are processed. NOTE(review): a remainder of 17..31 bytes would also take the 2-block path - presumably callers only pass whole blocks; confirm */
+ beq L_AES_GCM_encrypt_NEON_start_1
+ blt L_AES_GCM_encrypt_NEON_data_done
+L_AES_GCM_encrypt_NEON_loop_2: /* two blocks; nothing in this function branches back to this label */
+ mov x12, x3
+ ld1 {v4.2d}, [x12], #16
+ # Round: 0 - XOR in key schedule
+ add w6, w6, #1
+ mov v2.s[3], w6
+ eor v0.16b, v2.16b, v4.16b
+ add w6, w6, #1
+ mov v2.s[3], w6
+ eor v1.16b, v2.16b, v4.16b
+ sub w11, w4, #2
+L_AES_GCM_encrypt_NEON_loop_nr_2:
+ eor v8.16b, v0.16b, v12.16b
+ eor v9.16b, v1.16b, v12.16b
+ tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b
+ tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b
+ eor v10.16b, v0.16b, v13.16b
+ eor v11.16b, v1.16b, v13.16b
+ tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b
+ tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v5.16b, v5.16b, v9.16b
+ eor v8.16b, v0.16b, v14.16b
+ eor v9.16b, v1.16b, v14.16b
+ orr v4.16b, v4.16b, v10.16b
+ orr v5.16b, v5.16b, v11.16b
+ tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b
+ tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v5.16b, v5.16b, v9.16b
+ ld1 {v0.16b}, [x10]
+ tbl v4.16b, {v4.16b}, v0.16b
+ tbl v5.16b, {v5.16b}, v0.16b
+ sshr v8.16b, v4.16b, #7
+ sshr v9.16b, v5.16b, #7
+ shl v10.16b, v4.16b, #1
+ shl v11.16b, v5.16b, #1
+ and v8.16b, v8.16b, v15.16b
+ and v9.16b, v9.16b, v15.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v9.16b, v9.16b, v11.16b
+ eor v0.16b, v8.16b, v4.16b
+ eor v1.16b, v9.16b, v5.16b
+ shl v10.4s, v0.4s, #8
+ shl v11.4s, v1.4s, #8
+ sri v10.4s, v0.4s, #24
+ sri v11.4s, v1.4s, #24
+ shl v0.4s, v4.4s, #24
+ shl v1.4s, v5.4s, #24
+ sri v0.4s, v4.4s, #8
+ sri v1.4s, v5.4s, #8
+ rev32 v4.8h, v4.8h
+ rev32 v5.8h, v5.8h
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v1.16b
+ # XOR in Key Schedule
+ ld1 {v0.2d}, [x12], #16
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ # Round Done
+ eor v8.16b, v4.16b, v12.16b
+ eor v9.16b, v5.16b, v12.16b
+ tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b
+ tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b
+ eor v10.16b, v4.16b, v13.16b
+ eor v11.16b, v5.16b, v13.16b
+ tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b
+ tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b
+ eor v8.16b, v4.16b, v14.16b
+ eor v9.16b, v5.16b, v14.16b
+ orr v0.16b, v0.16b, v10.16b
+ orr v1.16b, v1.16b, v11.16b
+ tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b
+ tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b
+ ld1 {v4.16b}, [x10]
+ tbl v0.16b, {v0.16b}, v4.16b
+ tbl v1.16b, {v1.16b}, v4.16b
+ sshr v8.16b, v0.16b, #7
+ sshr v9.16b, v1.16b, #7
+ shl v10.16b, v0.16b, #1
+ shl v11.16b, v1.16b, #1
+ and v8.16b, v8.16b, v15.16b
+ and v9.16b, v9.16b, v15.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v9.16b, v9.16b, v11.16b
+ eor v4.16b, v8.16b, v0.16b
+ eor v5.16b, v9.16b, v1.16b
+ shl v10.4s, v4.4s, #8
+ shl v11.4s, v5.4s, #8
+ sri v10.4s, v4.4s, #24
+ sri v11.4s, v5.4s, #24
+ shl v4.4s, v0.4s, #24
+ shl v5.4s, v1.4s, #24
+ sri v4.4s, v0.4s, #8
+ sri v5.4s, v1.4s, #8
+ rev32 v0.8h, v0.8h
+ rev32 v1.8h, v1.8h
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v5.16b
+ # XOR in Key Schedule
+ ld1 {v4.2d}, [x12], #16
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v4.16b
+ eor v0.16b, v0.16b, v10.16b
+ eor v1.16b, v1.16b, v11.16b
+ # Round Done
+ subs w11, w11, #2
+ bne L_AES_GCM_encrypt_NEON_loop_nr_2
+ eor v8.16b, v0.16b, v12.16b
+ eor v9.16b, v1.16b, v12.16b
+ tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b
+ tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b
+ eor v10.16b, v0.16b, v13.16b
+ eor v11.16b, v1.16b, v13.16b
+ tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b
+ tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v5.16b, v5.16b, v9.16b
+ eor v8.16b, v0.16b, v14.16b
+ eor v9.16b, v1.16b, v14.16b
+ orr v4.16b, v4.16b, v10.16b
+ orr v5.16b, v5.16b, v11.16b
+ tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b
+ tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v5.16b, v5.16b, v9.16b
+ ld1 {v0.16b}, [x10]
+ tbl v4.16b, {v4.16b}, v0.16b
+ tbl v5.16b, {v5.16b}, v0.16b
+ sshr v8.16b, v4.16b, #7
+ sshr v9.16b, v5.16b, #7
+ shl v10.16b, v4.16b, #1
+ shl v11.16b, v5.16b, #1
+ and v8.16b, v8.16b, v15.16b
+ and v9.16b, v9.16b, v15.16b
+ eor v8.16b, v8.16b, v10.16b
+ eor v9.16b, v9.16b, v11.16b
+ eor v0.16b, v8.16b, v4.16b
+ eor v1.16b, v9.16b, v5.16b
+ shl v10.4s, v0.4s, #8
+ shl v11.4s, v1.4s, #8
+ sri v10.4s, v0.4s, #24
+ sri v11.4s, v1.4s, #24
+ shl v0.4s, v4.4s, #24
+ shl v1.4s, v5.4s, #24
+ sri v0.4s, v4.4s, #8
+ sri v1.4s, v5.4s, #8
+ rev32 v4.8h, v4.8h
+ rev32 v5.8h, v5.8h
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v1.16b
+ # XOR in Key Schedule
+ ld1 {v0.2d}, [x12], #16
+ eor v4.16b, v4.16b, v8.16b
+ eor v5.16b, v5.16b, v9.16b
+ eor v4.16b, v4.16b, v0.16b
+ eor v5.16b, v5.16b, v0.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v5.16b, v5.16b, v11.16b
+ # Round Done
+ eor v8.16b, v4.16b, v12.16b /* last round for the two blocks: no mixing step */
+ eor v9.16b, v5.16b, v12.16b
+ tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b
+ tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b
+ eor v10.16b, v4.16b, v13.16b
+ eor v11.16b, v5.16b, v13.16b
+ tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b
+ tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b
+ eor v8.16b, v4.16b, v14.16b
+ eor v9.16b, v5.16b, v14.16b
+ orr v0.16b, v0.16b, v10.16b
+ orr v1.16b, v1.16b, v11.16b
+ tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b
+ tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v1.16b, v1.16b, v9.16b
+ ld1 {v4.16b}, [x10]
+ tbl v0.16b, {v0.16b}, v4.16b
+ tbl v1.16b, {v1.16b}, v4.16b
+ # XOR in Key Schedule
+ ld1 {v4.2d}, [x12], #16
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v4.16b
+ # Round Done
+ rev32 v0.16b, v0.16b
+ rev32 v1.16b, v1.16b
+ ld1 {v4.16b, v5.16b}, [x0], #32
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v5.16b
+ st1 {v0.16b, v1.16b}, [x1], #32
+ sub x2, x2, #32
+ cmp x2, #0
+ beq L_AES_GCM_encrypt_NEON_data_done
+L_AES_GCM_encrypt_NEON_start_1: /* single block */
+ ld1 {v3.2d}, [x10] /* v3 = byte permutation pattern, kept in a register for every round of this path */
+ mov x12, x3
+ add w6, w6, #1
+ ld1 {v4.2d}, [x12], #16
+ mov v2.s[3], w6
+ # Round: 0 - XOR in key schedule
+ eor v0.16b, v2.16b, v4.16b
+ sub w11, w4, #2
+L_AES_GCM_encrypt_NEON_loop_nr_1:
+ eor v8.16b, v0.16b, v12.16b
+ eor v9.16b, v0.16b, v13.16b
+ eor v10.16b, v0.16b, v14.16b
+ tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b
+ tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v9.16b, v9.16b, v10.16b
+ orr v4.16b, v4.16b, v9.16b
+ tbl v4.16b, {v4.16b}, v3.16b
+ ld1 {v0.2d}, [x12], #16 /* next round key, loaded early; it is XORed in below */
+ sshr v10.16b, v4.16b, #7
+ shl v9.16b, v4.16b, #1
+ and v10.16b, v10.16b, v15.16b
+ eor v10.16b, v10.16b, v9.16b
+ rev32 v8.8h, v4.8h
+ eor v11.16b, v10.16b, v4.16b
+ eor v10.16b, v10.16b, v8.16b
+ shl v9.4s, v4.4s, #24
+ shl v8.4s, v11.4s, #8
+ # XOR in Key Schedule
+ eor v10.16b, v10.16b, v0.16b
+ sri v9.4s, v4.4s, #8
+ sri v8.4s, v11.4s, #24
+ eor v4.16b, v10.16b, v9.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v8.16b, v4.16b, v12.16b
+ eor v9.16b, v4.16b, v13.16b
+ eor v10.16b, v4.16b, v14.16b
+ tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b
+ tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v9.16b, v9.16b, v10.16b
+ orr v0.16b, v0.16b, v9.16b
+ tbl v0.16b, {v0.16b}, v3.16b
+ ld1 {v4.2d}, [x12], #16
+ sshr v10.16b, v0.16b, #7
+ shl v9.16b, v0.16b, #1
+ and v10.16b, v10.16b, v15.16b
+ eor v10.16b, v10.16b, v9.16b
+ rev32 v8.8h, v0.8h
+ eor v11.16b, v10.16b, v0.16b
+ eor v10.16b, v10.16b, v8.16b
+ shl v9.4s, v0.4s, #24
+ shl v8.4s, v11.4s, #8
+ # XOR in Key Schedule
+ eor v10.16b, v10.16b, v4.16b
+ sri v9.4s, v0.4s, #8
+ sri v8.4s, v11.4s, #24
+ eor v0.16b, v10.16b, v9.16b
+ eor v0.16b, v0.16b, v8.16b
+ subs w11, w11, #2
+ bne L_AES_GCM_encrypt_NEON_loop_nr_1
+ eor v8.16b, v0.16b, v12.16b
+ eor v9.16b, v0.16b, v13.16b
+ eor v10.16b, v0.16b, v14.16b
+ tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b
+ tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b
+ orr v4.16b, v4.16b, v8.16b
+ orr v9.16b, v9.16b, v10.16b
+ orr v4.16b, v4.16b, v9.16b
+ tbl v4.16b, {v4.16b}, v3.16b
+ ld1 {v0.2d}, [x12], #16
+ sshr v10.16b, v4.16b, #7
+ shl v9.16b, v4.16b, #1
+ and v10.16b, v10.16b, v15.16b
+ eor v10.16b, v10.16b, v9.16b
+ rev32 v8.8h, v4.8h
+ eor v11.16b, v10.16b, v4.16b
+ eor v10.16b, v10.16b, v8.16b
+ shl v9.4s, v4.4s, #24
+ shl v8.4s, v11.4s, #8
+ # XOR in Key Schedule
+ eor v10.16b, v10.16b, v0.16b
+ sri v9.4s, v4.4s, #8
+ sri v8.4s, v11.4s, #24
+ eor v4.16b, v10.16b, v9.16b
+ eor v4.16b, v4.16b, v8.16b
+ eor v8.16b, v4.16b, v12.16b /* last round for the single block: no mixing step */
+ eor v9.16b, v4.16b, v13.16b
+ eor v10.16b, v4.16b, v14.16b
+ tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b
+ tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b
+ tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b
+ orr v0.16b, v0.16b, v8.16b
+ orr v9.16b, v9.16b, v10.16b
+ orr v0.16b, v0.16b, v9.16b
+ tbl v0.16b, {v0.16b}, v3.16b
+ ld1 {v4.2d}, [x12], #16
+ # XOR in Key Schedule
+ eor v0.16b, v0.16b, v4.16b
+ rev32 v0.16b, v0.16b
+ ld1 {v4.16b}, [x0], #16
+ eor v0.16b, v0.16b, v4.16b
+ st1 {v0.16b}, [x1], #16
+L_AES_GCM_encrypt_NEON_data_done:
+ rev32 v2.16b, v2.16b /* swap the counter block back and write it to [x5], so the caller sees the last counter value used */
+ st1 {v2.2d}, [x5]
+ ldp d8, d9, [x29, #16]
+ ldp d10, d11, [x29, #32]
+ ldp d12, d13, [x29, #48]
+ ldp d14, d15, [x29, #64]
+ ldp x29, x30, [sp], #0x50
+ ret
+#ifndef __APPLE__
+ .size AES_GCM_encrypt_NEON,.-AES_GCM_encrypt_NEON
+#endif /* __APPLE__ */
+#endif /* HAVE_AESGCM */
+#ifdef WOLFSSL_AES_XTS
+#ifndef __APPLE__
+.text
+.globl AES_XTS_encrypt_NEON
+.type AES_XTS_encrypt_NEON,@function
+.align 2
+AES_XTS_encrypt_NEON:
+#else
+.section __TEXT,__text
+.globl _AES_XTS_encrypt_NEON
+.p2align 2
+_AES_XTS_encrypt_NEON:
+#endif /* __APPLE__ */
+ stp x29, x30, [sp, #-128]!
+ add x29, sp, #0 + stp x17, x19, [x29, #24] + stp x20, x21, [x29, #40] + str x22, [x29, #56] + stp d8, d9, [x29, #64] + stp d10, d11, [x29, #80] + stp d12, d13, [x29, #96] + stp d14, d15, [x29, #112] +#ifndef __APPLE__ + adrp x19, L_AES_ARM64_NEON_te + add x19, x19, :lo12:L_AES_ARM64_NEON_te +#else + adrp x19, L_AES_ARM64_NEON_te@PAGE + add x19, x19, :lo12:L_AES_ARM64_NEON_te@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x20, L_AES_ARM64_NEON_shift_rows_shuffle + add x20, x20, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle +#else + adrp x20, L_AES_ARM64_NEON_shift_rows_shuffle@PAGE + add x20, x20, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x19], #0x40 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x19], #0x40 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x19], #0x40 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x19] + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + movi v15.16b, #27 + ld1 {v3.2d}, [x20] + mov x17, #0x87 + ld1 {v2.2d}, [x3] + ld1 {v4.2d}, [x5] + rev32 v2.16b, v2.16b + add x22, x5, #16 + # Round: 0 - XOR in key schedule + eor v2.16b, v2.16b, v4.16b + sub w21, w7, #2 +L_AES_XTS_encrypt_NEON_loop_nr_tweak: + eor v8.16b, v2.16b, v12.16b + eor v9.16b, v2.16b, v13.16b + eor v10.16b, v2.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v2.2d}, [x22], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, 
v10.16b, v2.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v2.16b, v2.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v2.16b, v2.16b, v9.16b + tbl v2.16b, {v2.16b}, v3.16b + ld1 {v4.2d}, [x22], #16 + sshr v10.16b, v2.16b, #7 + shl v9.16b, v2.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v2.8h + eor v11.16b, v10.16b, v2.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v2.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v4.16b + sri v9.4s, v2.4s, #8 + sri v8.4s, v11.4s, #24 + eor v2.16b, v10.16b, v9.16b + eor v2.16b, v2.16b, v8.16b + subs w21, w21, #2 + bne L_AES_XTS_encrypt_NEON_loop_nr_tweak + eor v8.16b, v2.16b, v12.16b + eor v9.16b, v2.16b, v13.16b + eor v10.16b, v2.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v2.2d}, [x22], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v2.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, 
v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v2.16b, v2.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v2.16b, v2.16b, v9.16b + tbl v2.16b, {v2.16b}, v3.16b + ld1 {v4.2d}, [x22], #16 + # XOR in Key Schedule + eor v2.16b, v2.16b, v4.16b + rev32 v2.16b, v2.16b + mov x8, v2.d[0] + mov x9, v2.d[1] /* x9:x8 = encrypted tweak as a 128-bit value, x8 being the low half */ + cmp w2, #0x40 + blt L_AES_XTS_encrypt_NEON_start_2 /* fewer than 64 bytes: skip the four-block loop */ +L_AES_XTS_encrypt_NEON_loop_4: /* processes four 16-byte blocks (64 bytes) per pass */ + mov x22, x4 /* restart at the first round key of the data key schedule */ + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld1 {v4.16b}, [x22], #16 + and x16, x17, x9, asr 63 /* x16 = 0x87 if the top bit of the tweak is set, otherwise 0 */ + extr x11, x9, x8, #63 + eor x10, x16, x8, lsl 1 /* x11:x10 = x9:x8 shifted left one bit with 0x87 folded into the low half on carry-out: the next tweak */ + and x16, x17, x11, asr 63 + extr x13, x11, x10, #63 + eor x12, x16, x10, lsl 1 + and x16, x17, x13, asr 63 + extr x15, x13, x12, #63 + eor x14, x16, x12, lsl 1 /* x13:x12 and x15:x14 are the third and fourth tweaks, derived the same way */ + mov v8.d[0], x8 + mov v8.d[1], x9 + mov v9.d[0], x10 + mov v9.d[1], x11 + mov v10.d[0], x12 + mov v10.d[1], x13 + mov v11.d[0], x14 + mov v11.d[1], x15 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b /* each input block is XORed with its own tweak before encryption */ + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b /* first round key applied to all four blocks */ + sub w21, w7, #2 /* two rounds per loop pass, two more after the loop */ +L_AES_XTS_encrypt_NEON_loop_nr_4: + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 /* v12-v15 double as scratch later in each round, so the index offsets are rebuilt here */ + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, 
v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x20] /* byte permutation is reloaded from memory each round because v0-v7 all carry block state in this loop */ + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + movi v0.16b, #27 + and v8.16b, v8.16b, v0.16b + and v9.16b, v9.16b, v0.16b + and v10.16b, v10.16b, v0.16b + and v11.16b, v11.16b, v0.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b /* v8-v11 = each state byte doubled, with 27 folded in where the top bit was set */ + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + eor v2.16b, v10.16b, v6.16b + eor v3.16b, v11.16b, v7.16b + shl v12.4s, v0.4s, #8 + 
shl v13.4s, v1.4s, #8 + shl v14.4s, v2.4s, #8 + shl v15.4s, v3.4s, #8 + sri v12.4s, v0.4s, #24 + sri v13.4s, v1.4s, #24 + sri v14.4s, v2.4s, #24 + sri v15.4s, v3.4s, #24 /* shl followed by sri forms a rotation of every 32-bit word */ + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + shl v2.4s, v6.4s, #24 + shl v3.4s, v7.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + sri v2.4s, v6.4s, #8 + sri v3.4s, v7.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + rev32 v6.8h, v6.8h + rev32 v7.8h, v7.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x22], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b 
+ tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x20] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + sshr v10.16b, v2.16b, #7 + sshr v11.16b, v3.16b, #7 + shl v12.16b, v0.16b, #1 + shl v13.16b, v1.16b, #1 + shl v14.16b, v2.16b, #1 + shl v15.16b, v3.16b, #1 + movi v4.16b, #27 + and v8.16b, v8.16b, v4.16b + and v9.16b, v9.16b, v4.16b + and v10.16b, v10.16b, v4.16b + and v11.16b, v11.16b, v4.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + eor v4.16b, v8.16b, v0.16b + eor v5.16b, v9.16b, v1.16b + eor v6.16b, v10.16b, v2.16b + eor v7.16b, v11.16b, v3.16b + shl v12.4s, v4.4s, #8 + shl v13.4s, v5.4s, #8 + shl v14.4s, v6.4s, #8 + shl v15.4s, v7.4s, #8 + sri v12.4s, v4.4s, #24 + sri v13.4s, v5.4s, #24 + sri v14.4s, v6.4s, #24 + sri v15.4s, v7.4s, #24 + shl v4.4s, v0.4s, #24 + shl v5.4s, v1.4s, #24 + shl v6.4s, v2.4s, #24 + shl v7.4s, v3.4s, #24 + sri v4.4s, v0.4s, #8 + sri v5.4s, v1.4s, #8 + sri v6.4s, v2.4s, #8 + sri v7.4s, v3.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + rev32 v2.8h, v2.8h + rev32 v3.8h, v3.8h + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, 
v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x22], #16 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + # Round Done + subs w21, w21, #2 + bne L_AES_XTS_encrypt_NEON_loop_nr_4 /* after the loop: one more full round, then a final round made of table lookup, permutation and key XOR only */ + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, 
v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x20] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + movi v0.16b, #27 + and v8.16b, v8.16b, v0.16b + and v9.16b, v9.16b, v0.16b + and v10.16b, v10.16b, v0.16b + and v11.16b, v11.16b, v0.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + eor v2.16b, v10.16b, v6.16b + eor v3.16b, v11.16b, v7.16b + shl v12.4s, v0.4s, #8 + shl v13.4s, v1.4s, #8 + shl v14.4s, v2.4s, #8 + shl v15.4s, v3.4s, #8 + sri v12.4s, v0.4s, #24 + sri v13.4s, v1.4s, #24 + sri v14.4s, v2.4s, #24 + sri v15.4s, v3.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + shl v2.4s, v6.4s, #24 + shl v3.4s, v7.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + sri v2.4s, v6.4s, #8 + sri v3.4s, v7.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + rev32 v6.8h, v6.8h + rev32 v7.8h, v7.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x22], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + # 
Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x20] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b /* final round: no doubling or word rotation, only the last round key remains to be XORed in */ + # XOR in Key Schedule + ld1 {v4.2d}, [x22], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, 
v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b + mov v8.d[0], x8 + mov v8.d[1], x9 + mov v9.d[0], x10 + mov v9.d[1], x11 + mov v10.d[0], x12 + mov v10.d[1], x13 + mov v11.d[0], x14 + mov v11.d[1], x15 /* the four tweaks are rebuilt from x8-x15 because v8-v11 were used as scratch during the rounds */ + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b /* second tweak XOR, applied to the cipher output of each block */ + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #0x40 + and x16, x17, x15, asr 63 + extr x9, x15, x14, #63 + eor x8, x16, x14, lsl 1 /* x9:x8 = fourth tweak doubled (shift left one bit, x17 folded in on carry-out): tweak for the next block */ + sub w2, w2, #0x40 + cmp w2, #0x40 + bge L_AES_XTS_encrypt_NEON_loop_4 + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + movi v15.16b, #27 /* restore the constants in v12-v15 that the four-block rounds overwrote */ +L_AES_XTS_encrypt_NEON_start_2: /* two-block path, taken when at least 32 bytes remain */ + cmp w2, #32 + blt L_AES_XTS_encrypt_NEON_start_1 + mov x22, x4 /* restart at the first round key of the data key schedule */ + ld1 {v0.16b, v1.16b}, [x0], #32 + ld1 {v4.16b}, [x22], #16 + and x16, x17, x9, asr 63 + extr x11, x9, x8, #63 + eor x10, x16, x8, lsl 1 /* x11:x10 = tweak for the second block */ + and x16, x17, x11, asr 63 + extr x13, x11, x10, #63 + eor x12, x16, x10, lsl 1 + mov v2.d[0], x8 + mov v2.d[1], x9 + mov v3.d[0], x10 + mov v3.d[1], x11 /* v2 and v3 keep the two tweaks for the whole path, so the byte permutation is loaded from [x20] inside the rounds instead */ + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v3.16b + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + sub w21, w7, #2 /* two rounds per loop pass, two more after the loop */ +L_AES_XTS_encrypt_NEON_loop_nr_2: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + 
tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x20] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v10.16b, v4.16b, #1 + shl v11.16b, v5.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, v9.16b, v11.16b /* v8, v9 = each state byte doubled, with the constant in v15 folded in where the top bit was set */ + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + shl v10.4s, v0.4s, #8 + shl v11.4s, v1.4s, #8 + sri v10.4s, v0.4s, #24 + sri v11.4s, v1.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x22], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # Round Done + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x20] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + sshr 
v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + shl v10.16b, v0.16b, #1 + shl v11.16b, v1.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, v9.16b, v11.16b + eor v4.16b, v8.16b, v0.16b + eor v5.16b, v9.16b, v1.16b + shl v10.4s, v4.4s, #8 + shl v11.4s, v5.4s, #8 + sri v10.4s, v4.4s, #24 + sri v11.4s, v5.4s, #24 + shl v4.4s, v0.4s, #24 + shl v5.4s, v1.4s, #24 + sri v4.4s, v0.4s, #8 + sri v5.4s, v1.4s, #8 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x22], #16 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v0.16b, v0.16b, v10.16b + eor v1.16b, v1.16b, v11.16b + # Round Done + subs w21, w21, #2 + bne L_AES_XTS_encrypt_NEON_loop_nr_2 /* after the loop: one more full round, then a final round made of table lookup, permutation and key XOR only */ + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x20] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v10.16b, v4.16b, #1 + shl v11.16b, v5.16b, #1 + and v8.16b, v8.16b, v15.16b + and v9.16b, v9.16b, v15.16b + eor v8.16b, v8.16b, v10.16b + eor v9.16b, 
v9.16b, v11.16b + eor v0.16b, v8.16b, v4.16b + eor v1.16b, v9.16b, v5.16b + shl v10.4s, v0.4s, #8 + shl v11.4s, v1.4s, #8 + sri v10.4s, v0.4s, #24 + sri v11.4s, v1.4s, #24 + shl v0.4s, v4.4s, #24 + shl v1.4s, v5.4s, #24 + sri v0.4s, v4.4s, #8 + sri v1.4s, v5.4s, #8 + rev32 v4.8h, v4.8h + rev32 v5.8h, v5.8h + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x22], #16 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # Round Done + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x20] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x22], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v3.16b /* second tweak XOR, applied to the cipher output */ + st1 {v0.16b, v1.16b}, [x1], #32 + and x16, x17, x11, asr 63 + extr x9, x11, x10, #63 + eor x8, x16, x10, lsl 1 /* x9:x8 = second tweak doubled: tweak for the next block */ + sub w2, w2, #32 +L_AES_XTS_encrypt_NEON_start_1: /* at most one full block remains from here on */ + ld1 {v3.2d}, [x20] /* v3 = byte permutation again; the paths above use v3 for other data */ + mov v2.d[0], x8 + mov v2.d[1], x9 + cmp w2, #16 + blt 
L_AES_XTS_encrypt_NEON_start_partial + mov x22, x4 /* restart at the first round key of the data key schedule */ + ld1 {v0.16b}, [x0], #16 + ld1 {v4.2d}, [x22], #16 + eor v0.16b, v0.16b, v2.16b /* XOR the input block with the current tweak held in v2 */ + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v4.16b + sub w21, w7, #2 /* two rounds per loop pass, two more after the loop */ +L_AES_XTS_encrypt_NEON_loop_nr_1: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b /* the four quarter-table lookups are merged with OR */ + tbl v4.16b, {v4.16b}, v3.16b /* apply the byte permutation in v3 */ + ld1 {v0.2d}, [x22], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b /* v10 = each byte doubled, with the constant in v15 folded in where the top bit was set */ + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v0.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x22], #16 + sshr v10.16b, v0.16b, #7 + shl v9.16b, v0.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v0.8h + eor v11.16b, v10.16b, v0.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v0.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v4.16b + sri v9.4s, v0.4s, #8 + sri v8.4s, v11.4s, #24 + eor v0.16b, v10.16b, v9.16b + eor v0.16b, v0.16b, 
v8.16b + subs w21, w21, #2 + bne L_AES_XTS_encrypt_NEON_loop_nr_1 /* after the loop: one more full round, then a final round made of table lookup, permutation and key XOR only */ + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v0.2d}, [x22], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v0.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x22], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v2.16b /* second tweak XOR, applied to the cipher output */ + st1 {v0.16b}, [x1], #16 + subs w2, w2, #16 + beq L_AES_XTS_encrypt_NEON_data_done /* nothing left: no partial block to handle */ + and x16, x17, x9, asr 63 + extr x9, x9, x8, #63 + eor x8, x16, x8, lsl 1 /* double the tweak in place in x9:x8 for the trailing partial block */ +L_AES_XTS_encrypt_NEON_start_partial: /* handles 1 to 15 trailing bytes by reusing the last full output block */ + cbz w2, L_AES_XTS_encrypt_NEON_data_done + mov v2.d[0], x8 + mov v2.d[1], x9 + mov x22, x4 + sub x1, x1, #16 + ld1 {v0.16b}, [x1], #16 /* re-read the last full block that was written to the output */ + st1 {v0.2d}, [x6] /* and stage it in the scratch buffer at x6 */ + mov w16, w2 /* w16 counts the trailing bytes still to swap */ +L_AES_XTS_encrypt_NEON_start_byte: + ldrb w10, [x6] + ldrb w11, [x0], #1 + strb w10, [x1], #1 + strb 
w11, [x6], #1 /* byte swap step: the scratch byte (taken from the previous output block) goes to the output tail and the next input byte takes its place in the scratch buffer */ + subs w16, w16, #1 + bgt L_AES_XTS_encrypt_NEON_start_byte + sub x1, x1, x2 + sub x6, x6, x2 /* rewind both pointers by the number of bytes just swapped */ + sub x1, x1, #16 /* x1 now addresses the previous full output block, which is overwritten below */ + ld1 {v0.2d}, [x6] /* scratch block: trailing input bytes followed by the rest of the previous output block */ + ld1 {v4.2d}, [x22], #16 + eor v0.16b, v0.16b, v2.16b /* XOR with the tweak in v2 before encryption */ + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v4.16b + sub w21, w7, #2 /* two rounds per loop pass, two more after the loop */ +L_AES_XTS_encrypt_NEON_loop_nr_partial: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v0.2d}, [x22], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v0.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x22], #16 + sshr v10.16b, v0.16b, #7 + shl v9.16b, v0.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v0.8h + eor v11.16b, v10.16b, v0.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v0.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v4.16b + sri v9.4s, v0.4s, #8 + sri 
v8.4s, v11.4s, #24 + eor v0.16b, v10.16b, v9.16b + eor v0.16b, v0.16b, v8.16b + subs w21, w21, #2 + bne L_AES_XTS_encrypt_NEON_loop_nr_partial /* after the loop: one more full round, then a final round made of table lookup, permutation and key XOR only */ + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v0.2d}, [x22], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v0.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x22], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v2.16b /* second tweak XOR, applied to the cipher output */ + st1 {v0.16b}, [x1] /* replaces the previous full output block; x1 is not advanced */ +L_AES_XTS_encrypt_NEON_data_done: /* common exit: reload the registers saved in the prologue and release the 128-byte frame */ + ldp x17, x19, [x29, #24] + ldp x20, x21, [x29, #40] + ldr x22, [x29, #56] + ldp d8, d9, [x29, #64] + ldp d10, d11, [x29, #80] + ldp d12, d13, [x29, #96] + ldp d14, d15, [x29, #112] + ldp x29, x30, [sp], #0x80 + ret +#ifndef __APPLE__ + .size AES_XTS_encrypt_NEON,.-AES_XTS_encrypt_NEON +#endif /* __APPLE__ */ +#ifdef HAVE_AES_DECRYPT 
+#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_NEON +.type AES_XTS_decrypt_NEON,@function +.align 2 +AES_XTS_decrypt_NEON: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt_NEON +.p2align 2 +_AES_XTS_decrypt_NEON: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-144]! + add x29, sp, #0 + stp x17, x19, [x29, #16] + stp x20, x21, [x29, #32] + stp x22, x23, [x29, #48] + stp x24, x25, [x29, #64] + stp d8, d9, [x29, #80] + stp d10, d11, [x29, #96] + stp d12, d13, [x29, #112] + stp d14, d15, [x29, #128] +#ifndef __APPLE__ + adrp x20, L_AES_ARM64_NEON_te + add x20, x20, :lo12:L_AES_ARM64_NEON_te +#else + adrp x20, L_AES_ARM64_NEON_te@PAGE + add x20, x20, :lo12:L_AES_ARM64_NEON_te@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x21, L_AES_ARM64_NEON_td + add x21, x21, :lo12:L_AES_ARM64_NEON_td +#else + adrp x21, L_AES_ARM64_NEON_td@PAGE + add x21, x21, :lo12:L_AES_ARM64_NEON_td@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x22, L_AES_ARM64_NEON_shift_rows_shuffle + add x22, x22, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle +#else + adrp x22, L_AES_ARM64_NEON_shift_rows_shuffle@PAGE + add x22, x22, :lo12:L_AES_ARM64_NEON_shift_rows_shuffle@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x23, L_AES_ARM64_NEON_shift_rows_invshuffle + add x23, x23, :lo12:L_AES_ARM64_NEON_shift_rows_invshuffle +#else + adrp x23, L_AES_ARM64_NEON_shift_rows_invshuffle@PAGE + add x23, x23, :lo12:L_AES_ARM64_NEON_shift_rows_invshuffle@PAGEOFF +#endif /* __APPLE__ */ + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x20], #0x40 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x20], #0x40 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x20], #0x40 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x20] + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + movi v15.16b, #27 + ld1 {v3.2d}, [x22] + mov x17, #0x87 + ands w19, w2, #15 + cset w16, ne + lsl w16, w16, #4 + sub w2, w2, w16 + ld1 {v2.2d}, [x3] + ld1 {v4.2d}, [x5] + rev32 v2.16b, v2.16b + add x25, x5, #16 + # 
Round: 0 - XOR in key schedule + eor v2.16b, v2.16b, v4.16b + sub w24, w7, #2 +L_AES_XTS_decrypt_NEON_loop_nr_tweak: + eor v8.16b, v2.16b, v12.16b + eor v9.16b, v2.16b, v13.16b + eor v10.16b, v2.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v2.2d}, [x25], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v2.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v2.16b, v2.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v2.16b, v2.16b, v9.16b + tbl v2.16b, {v2.16b}, v3.16b + ld1 {v4.2d}, [x25], #16 + sshr v10.16b, v2.16b, #7 + shl v9.16b, v2.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v2.8h + eor v11.16b, v10.16b, v2.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v2.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v4.16b + sri v9.4s, v2.4s, #8 + sri v8.4s, v11.4s, #24 + eor v2.16b, v10.16b, v9.16b + eor v2.16b, v2.16b, v8.16b + subs w24, w24, #2 + bne L_AES_XTS_decrypt_NEON_loop_nr_tweak + eor v8.16b, v2.16b, v12.16b + eor v9.16b, v2.16b, 
v13.16b + eor v10.16b, v2.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + ld1 {v2.2d}, [x25], #16 + sshr v10.16b, v4.16b, #7 + shl v9.16b, v4.16b, #1 + and v10.16b, v10.16b, v15.16b + eor v10.16b, v10.16b, v9.16b + rev32 v8.8h, v4.8h + eor v11.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b + shl v9.4s, v4.4s, #24 + shl v8.4s, v11.4s, #8 + # XOR in Key Schedule + eor v10.16b, v10.16b, v2.16b + sri v9.4s, v4.4s, #8 + sri v8.4s, v11.4s, #24 + eor v4.16b, v10.16b, v9.16b + eor v4.16b, v4.16b, v8.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v2.16b, v2.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v2.16b, v2.16b, v9.16b + tbl v2.16b, {v2.16b}, v3.16b + ld1 {v4.2d}, [x25], #16 + # XOR in Key Schedule + eor v2.16b, v2.16b, v4.16b + rev32 v2.16b, v2.16b + mov x8, v2.d[0] + mov x9, v2.d[1] + ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x21], #0x40 + ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x21], #0x40 + ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x21], #0x40 + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x21] + ld1 {v3.2d}, [x23] + cmp w2, #0x40 + blt L_AES_XTS_decrypt_NEON_start_2 +L_AES_XTS_decrypt_NEON_loop_4: + mov x25, x4 + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 + ld1 {v4.16b}, [x25], #16 + and x16, x17, x9, asr 63 + extr x11, x9, x8, #63 + eor x10, x16, x8, lsl 1 + and x16, x17, x11, asr 63 + extr x13, x11, x10, #63 + eor x12, x16, x10, lsl 1 + and 
x16, x17, x13, asr 63 + extr x15, x13, x12, #63 + eor x14, x16, x12, lsl 1 + mov v8.d[0], x8 + mov v8.d[1], x9 + mov v9.d[0], x10 + mov v9.d[1], x11 + mov v10.d[0], x12 + mov v10.d[1], x13 + mov v11.d[0], x14 + mov v11.d[1], x15 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + sub w24, w7, #2 +L_AES_XTS_decrypt_NEON_loop_nr_4: + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl 
v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x23] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + movi v28.16b, #27 + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + and v8.16b, v8.16b, v28.16b + and v9.16b, v9.16b, v28.16b + and v10.16b, v10.16b, v28.16b + and v11.16b, v11.16b, v28.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + ushr v12.16b, v4.16b, #6 + ushr v13.16b, v5.16b, #6 + ushr v14.16b, v6.16b, #6 + ushr v15.16b, v7.16b, #6 + shl v0.16b, v4.16b, #2 + shl v1.16b, v5.16b, #2 + shl v2.16b, v6.16b, #2 + shl v3.16b, v7.16b, #2 + pmul v12.16b, v12.16b, v28.16b + pmul v13.16b, v13.16b, v28.16b + pmul v14.16b, v14.16b, v28.16b + pmul v15.16b, v15.16b, v28.16b + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + ushr v0.16b, v4.16b, #5 + ushr v1.16b, v5.16b, #5 + ushr v2.16b, v6.16b, #5 + ushr v3.16b, v7.16b, #5 + pmul v0.16b, v0.16b, v28.16b + pmul v1.16b, v1.16b, v28.16b + pmul v2.16b, v2.16b, v28.16b + pmul v3.16b, v3.16b, v28.16b + shl v28.16b, v4.16b, #3 + shl v29.16b, v5.16b, #3 + shl v30.16b, v6.16b, #3 + shl v31.16b, v7.16b, #3 + eor v0.16b, v0.16b, v28.16b + eor v1.16b, v1.16b, v29.16b + eor v2.16b, v2.16b, v30.16b + eor v3.16b, v3.16b, v31.16b + eor v28.16b, v8.16b, v0.16b + eor v29.16b, v9.16b, v1.16b + eor v30.16b, v10.16b, v2.16b + eor v31.16b, 
v11.16b, v3.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + eor v8.16b, v12.16b, v0.16b + eor v9.16b, v13.16b, v1.16b + eor v10.16b, v14.16b, v2.16b + eor v11.16b, v15.16b, v3.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + eor v28.16b, v28.16b, v4.16b + eor v29.16b, v29.16b, v5.16b + eor v30.16b, v30.16b, v6.16b + eor v31.16b, v31.16b, v7.16b + shl v4.4s, v28.4s, #8 + shl v5.4s, v29.4s, #8 + shl v6.4s, v30.4s, #8 + shl v7.4s, v31.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + rev32 v10.8h, v10.8h + rev32 v11.8h, v11.8h + sri v4.4s, v28.4s, #24 + sri v5.4s, v29.4s, #24 + sri v6.4s, v30.4s, #24 + sri v7.4s, v31.4s, #24 + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + shl v28.4s, v0.4s, #24 + shl v29.4s, v1.4s, #24 + shl v30.4s, v2.4s, #24 + shl v31.4s, v3.4s, #24 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + sri v28.4s, v0.4s, #8 + sri v29.4s, v1.4s, #8 + sri v30.4s, v2.4s, #8 + sri v31.4s, v3.4s, #8 + eor v4.16b, v4.16b, v28.16b + eor v5.16b, v5.16b, v29.16b + eor v6.16b, v6.16b, v30.16b + eor v7.16b, v7.16b, v31.16b + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x21] + # XOR in Key Schedule + ld1 {v0.2d}, [x25], #16 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b 
+ tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x23] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b + movi v28.16b, #27 + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + sshr v10.16b, v2.16b, #7 + sshr v11.16b, v3.16b, #7 + shl v12.16b, v0.16b, #1 + shl v13.16b, v1.16b, #1 + shl v14.16b, v2.16b, #1 + shl v15.16b, v3.16b, #1 + and v8.16b, v8.16b, v28.16b + and v9.16b, v9.16b, v28.16b + and v10.16b, v10.16b, v28.16b + and v11.16b, v11.16b, v28.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + ushr v12.16b, v0.16b, #6 + ushr v13.16b, v1.16b, #6 + 
ushr v14.16b, v2.16b, #6 + ushr v15.16b, v3.16b, #6 + shl v4.16b, v0.16b, #2 + shl v5.16b, v1.16b, #2 + shl v6.16b, v2.16b, #2 + shl v7.16b, v3.16b, #2 + pmul v12.16b, v12.16b, v28.16b + pmul v13.16b, v13.16b, v28.16b + pmul v14.16b, v14.16b, v28.16b + pmul v15.16b, v15.16b, v28.16b + eor v12.16b, v12.16b, v4.16b + eor v13.16b, v13.16b, v5.16b + eor v14.16b, v14.16b, v6.16b + eor v15.16b, v15.16b, v7.16b + ushr v4.16b, v0.16b, #5 + ushr v5.16b, v1.16b, #5 + ushr v6.16b, v2.16b, #5 + ushr v7.16b, v3.16b, #5 + pmul v4.16b, v4.16b, v28.16b + pmul v5.16b, v5.16b, v28.16b + pmul v6.16b, v6.16b, v28.16b + pmul v7.16b, v7.16b, v28.16b + shl v28.16b, v0.16b, #3 + shl v29.16b, v1.16b, #3 + shl v30.16b, v2.16b, #3 + shl v31.16b, v3.16b, #3 + eor v4.16b, v4.16b, v28.16b + eor v5.16b, v5.16b, v29.16b + eor v6.16b, v6.16b, v30.16b + eor v7.16b, v7.16b, v31.16b + eor v28.16b, v8.16b, v4.16b + eor v29.16b, v9.16b, v5.16b + eor v30.16b, v10.16b, v6.16b + eor v31.16b, v11.16b, v7.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v6.16b, v6.16b, v2.16b + eor v7.16b, v7.16b, v3.16b + eor v8.16b, v12.16b, v4.16b + eor v9.16b, v13.16b, v5.16b + eor v10.16b, v14.16b, v6.16b + eor v11.16b, v15.16b, v7.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + shl v0.4s, v28.4s, #8 + shl v1.4s, v29.4s, #8 + shl v2.4s, v30.4s, #8 + shl v3.4s, v31.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + rev32 v10.8h, v10.8h + rev32 v11.8h, v11.8h + sri v0.4s, v28.4s, #24 + sri v1.4s, v29.4s, #24 + sri v2.4s, v30.4s, #24 + sri v3.4s, v31.4s, #24 + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + eor v2.16b, v2.16b, v14.16b + eor v3.16b, v3.16b, v15.16b + shl v28.4s, v4.4s, #24 + shl v29.4s, v5.4s, #24 + shl v30.4s, v6.4s, #24 + shl v31.4s, v7.4s, #24 + eor v0.16b, v0.16b, 
v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b + sri v28.4s, v4.4s, #8 + sri v29.4s, v5.4s, #8 + sri v30.4s, v6.4s, #8 + sri v31.4s, v7.4s, #8 + eor v0.16b, v0.16b, v28.16b + eor v1.16b, v1.16b, v29.16b + eor v2.16b, v2.16b, v30.16b + eor v3.16b, v3.16b, v31.16b + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x21] + # XOR in Key Schedule + ld1 {v4.2d}, [x25], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + # Round Done + subs w24, w24, #2 + bne L_AES_XTS_decrypt_NEON_loop_nr_4 + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b + tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + eor v10.16b, v2.16b, v12.16b + eor v11.16b, v3.16b, v12.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v13.16b + eor v9.16b, v1.16b, v13.16b + eor v10.16b, v2.16b, v13.16b + eor v11.16b, v3.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + eor v10.16b, v2.16b, v14.16b + eor v11.16b, v3.16b, v14.16b + tbl v8.16b, 
{v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + orr v6.16b, v6.16b, v10.16b + orr v7.16b, v7.16b, v11.16b + ld1 {v0.16b}, [x23] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + tbl v6.16b, {v6.16b}, v0.16b + tbl v7.16b, {v7.16b}, v0.16b + movi v28.16b, #27 + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + sshr v10.16b, v6.16b, #7 + sshr v11.16b, v7.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + shl v14.16b, v6.16b, #1 + shl v15.16b, v7.16b, #1 + and v8.16b, v8.16b, v28.16b + and v9.16b, v9.16b, v28.16b + and v10.16b, v10.16b, v28.16b + and v11.16b, v11.16b, v28.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + eor v10.16b, v10.16b, v14.16b + eor v11.16b, v11.16b, v15.16b + ushr v12.16b, v4.16b, #6 + ushr v13.16b, v5.16b, #6 + ushr v14.16b, v6.16b, #6 + ushr v15.16b, v7.16b, #6 + shl v0.16b, v4.16b, #2 + shl v1.16b, v5.16b, #2 + shl v2.16b, v6.16b, #2 + shl v3.16b, v7.16b, #2 + pmul v12.16b, v12.16b, v28.16b + pmul v13.16b, v13.16b, v28.16b + pmul v14.16b, v14.16b, v28.16b + pmul v15.16b, v15.16b, v28.16b + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + eor v14.16b, v14.16b, v2.16b + eor v15.16b, v15.16b, v3.16b + ushr v0.16b, v4.16b, #5 + ushr v1.16b, v5.16b, #5 + ushr v2.16b, v6.16b, #5 + ushr v3.16b, v7.16b, #5 + pmul v0.16b, v0.16b, v28.16b + pmul v1.16b, v1.16b, v28.16b + pmul v2.16b, v2.16b, v28.16b + pmul v3.16b, v3.16b, v28.16b + shl v28.16b, v4.16b, #3 + shl v29.16b, v5.16b, #3 + shl v30.16b, v6.16b, #3 + shl v31.16b, v7.16b, #3 + eor v0.16b, v0.16b, v28.16b + eor v1.16b, v1.16b, v29.16b + eor v2.16b, v2.16b, v30.16b + eor v3.16b, v3.16b, v31.16b + eor v28.16b, v8.16b, v0.16b + eor v29.16b, v9.16b, v1.16b + eor v30.16b, v10.16b, v2.16b + eor v31.16b, v11.16b, 
v3.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + eor v3.16b, v3.16b, v7.16b + eor v8.16b, v12.16b, v0.16b + eor v9.16b, v13.16b, v1.16b + eor v10.16b, v14.16b, v2.16b + eor v11.16b, v15.16b, v3.16b + eor v12.16b, v12.16b, v28.16b + eor v13.16b, v13.16b, v29.16b + eor v14.16b, v14.16b, v30.16b + eor v15.16b, v15.16b, v31.16b + eor v28.16b, v28.16b, v4.16b + eor v29.16b, v29.16b, v5.16b + eor v30.16b, v30.16b, v6.16b + eor v31.16b, v31.16b, v7.16b + shl v4.4s, v28.4s, #8 + shl v5.4s, v29.4s, #8 + shl v6.4s, v30.4s, #8 + shl v7.4s, v31.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + rev32 v10.8h, v10.8h + rev32 v11.8h, v11.8h + sri v4.4s, v28.4s, #24 + sri v5.4s, v29.4s, #24 + sri v6.4s, v30.4s, #24 + sri v7.4s, v31.4s, #24 + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + eor v6.16b, v6.16b, v14.16b + eor v7.16b, v7.16b, v15.16b + shl v28.4s, v0.4s, #24 + shl v29.4s, v1.4s, #24 + shl v30.4s, v2.4s, #24 + shl v31.4s, v3.4s, #24 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + eor v6.16b, v6.16b, v10.16b + eor v7.16b, v7.16b, v11.16b + sri v28.4s, v0.4s, #8 + sri v29.4s, v1.4s, #8 + sri v30.4s, v2.4s, #8 + sri v31.4s, v3.4s, #8 + eor v4.16b, v4.16b, v28.16b + eor v5.16b, v5.16b, v29.16b + eor v6.16b, v6.16b, v30.16b + eor v7.16b, v7.16b, v31.16b + ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x21] + # XOR in Key Schedule + ld1 {v0.2d}, [x25], #16 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + eor v6.16b, v6.16b, v0.16b + eor v7.16b, v7.16b, v0.16b + # Round Done + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b + tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + eor v10.16b, v6.16b, v12.16b + eor v11.16b, v7.16b, v12.16b + tbl 
v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b + tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v13.16b + eor v9.16b, v5.16b, v13.16b + eor v10.16b, v6.16b, v13.16b + eor v11.16b, v7.16b, v13.16b + tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + eor v10.16b, v6.16b, v14.16b + eor v11.16b, v7.16b, v14.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + orr v2.16b, v2.16b, v10.16b + orr v3.16b, v3.16b, v11.16b + ld1 {v4.16b}, [x23] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + tbl v2.16b, {v2.16b}, v4.16b + tbl v3.16b, {v3.16b}, v4.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x25], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + eor v2.16b, v2.16b, v4.16b + eor v3.16b, v3.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + rev32 v2.16b, v2.16b + rev32 v3.16b, v3.16b + mov v8.d[0], x8 + mov v8.d[1], x9 + mov v9.d[0], x10 + mov v9.d[1], x11 + mov v10.d[0], x12 + mov v10.d[1], x13 + mov v11.d[0], x14 + mov v11.d[1], x15 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, 
v11.16b + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #0x40 + and x16, x17, x15, asr 63 + extr x9, x15, x14, #63 + eor x8, x16, x14, lsl 1 + sub w2, w2, #0x40 + cmp w2, #0x40 + bge L_AES_XTS_decrypt_NEON_loop_4 + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + movi v15.16b, #27 +L_AES_XTS_decrypt_NEON_start_2: + cmp w2, #32 + blt L_AES_XTS_decrypt_NEON_start_1 + mov x25, x4 + ld1 {v0.16b, v1.16b}, [x0], #32 + ld1 {v4.16b}, [x25], #16 + and x16, x17, x9, asr 63 + extr x11, x9, x8, #63 + eor x10, x16, x8, lsl 1 + and x16, x17, x11, asr 63 + extr x13, x11, x10, #63 + eor x12, x16, x10, lsl 1 + mov v2.d[0], x8 + mov v2.d[1], x9 + mov v3.d[0], x10 + mov v3.d[1], x11 + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v3.16b + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + sub w24, w7, #2 +L_AES_XTS_decrypt_NEON_loop_nr_2: + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x23] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + movi v10.16b, #27 + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, 
v5.16b, #1 + and v8.16b, v8.16b, v10.16b + and v9.16b, v9.16b, v10.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + ushr v12.16b, v4.16b, #6 + ushr v13.16b, v5.16b, #6 + shl v0.16b, v4.16b, #2 + shl v1.16b, v5.16b, #2 + pmul v12.16b, v12.16b, v10.16b + pmul v13.16b, v13.16b, v10.16b + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + ushr v0.16b, v4.16b, #5 + ushr v1.16b, v5.16b, #5 + pmul v0.16b, v0.16b, v10.16b + pmul v1.16b, v1.16b, v10.16b + shl v10.16b, v4.16b, #3 + shl v11.16b, v5.16b, #3 + eor v0.16b, v0.16b, v10.16b + eor v1.16b, v1.16b, v11.16b + eor v10.16b, v8.16b, v0.16b + eor v11.16b, v9.16b, v1.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v8.16b, v12.16b, v0.16b + eor v9.16b, v13.16b, v1.16b + eor v12.16b, v12.16b, v10.16b + eor v13.16b, v13.16b, v11.16b + eor v10.16b, v10.16b, v4.16b + eor v11.16b, v11.16b, v5.16b + shl v4.4s, v10.4s, #8 + shl v5.4s, v11.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + sri v4.4s, v10.4s, #24 + sri v5.4s, v11.4s, #24 + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + shl v10.4s, v0.4s, #24 + shl v11.4s, v1.4s, #24 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + sri v10.4s, v0.4s, #8 + sri v11.4s, v1.4s, #8 + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x25], #16 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + # Round Done + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, 
v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x23] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + movi v10.16b, #27 + sshr v8.16b, v0.16b, #7 + sshr v9.16b, v1.16b, #7 + shl v12.16b, v0.16b, #1 + shl v13.16b, v1.16b, #1 + and v8.16b, v8.16b, v10.16b + and v9.16b, v9.16b, v10.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + ushr v12.16b, v0.16b, #6 + ushr v13.16b, v1.16b, #6 + shl v4.16b, v0.16b, #2 + shl v5.16b, v1.16b, #2 + pmul v12.16b, v12.16b, v10.16b + pmul v13.16b, v13.16b, v10.16b + eor v12.16b, v12.16b, v4.16b + eor v13.16b, v13.16b, v5.16b + ushr v4.16b, v0.16b, #5 + ushr v5.16b, v1.16b, #5 + pmul v4.16b, v4.16b, v10.16b + pmul v5.16b, v5.16b, v10.16b + shl v10.16b, v0.16b, #3 + shl v11.16b, v1.16b, #3 + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v10.16b, v8.16b, v4.16b + eor v11.16b, v9.16b, v5.16b + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v1.16b + eor v8.16b, v12.16b, v4.16b + eor v9.16b, v13.16b, v5.16b + eor v12.16b, v12.16b, v10.16b + eor v13.16b, v13.16b, v11.16b + eor v10.16b, v10.16b, v0.16b + eor v11.16b, v11.16b, v1.16b + shl v0.4s, v10.4s, #8 + shl v1.4s, v11.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + sri v0.4s, v10.4s, #24 + sri v1.4s, v11.4s, #24 + eor v0.16b, v0.16b, v12.16b + eor v1.16b, v1.16b, v13.16b + shl v10.4s, v4.4s, #24 + shl v11.4s, v5.4s, #24 + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + sri v10.4s, v4.4s, #8 + sri v11.4s, v5.4s, #8 + eor v0.16b, v0.16b, v10.16b + eor v1.16b, v1.16b, v11.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x25], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + # Round Done + subs w24, w24, #2 + bne 
L_AES_XTS_decrypt_NEON_loop_nr_2 + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v1.16b, v12.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v0.16b, v13.16b + eor v11.16b, v1.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + eor v8.16b, v0.16b, v14.16b + eor v9.16b, v1.16b, v14.16b + orr v4.16b, v4.16b, v10.16b + orr v5.16b, v5.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v4.16b, v4.16b, v8.16b + orr v5.16b, v5.16b, v9.16b + ld1 {v0.16b}, [x23] + tbl v4.16b, {v4.16b}, v0.16b + tbl v5.16b, {v5.16b}, v0.16b + movi v10.16b, #27 + sshr v8.16b, v4.16b, #7 + sshr v9.16b, v5.16b, #7 + shl v12.16b, v4.16b, #1 + shl v13.16b, v5.16b, #1 + and v8.16b, v8.16b, v10.16b + and v9.16b, v9.16b, v10.16b + eor v8.16b, v8.16b, v12.16b + eor v9.16b, v9.16b, v13.16b + ushr v12.16b, v4.16b, #6 + ushr v13.16b, v5.16b, #6 + shl v0.16b, v4.16b, #2 + shl v1.16b, v5.16b, #2 + pmul v12.16b, v12.16b, v10.16b + pmul v13.16b, v13.16b, v10.16b + eor v12.16b, v12.16b, v0.16b + eor v13.16b, v13.16b, v1.16b + ushr v0.16b, v4.16b, #5 + ushr v1.16b, v5.16b, #5 + pmul v0.16b, v0.16b, v10.16b + pmul v1.16b, v1.16b, v10.16b + shl v10.16b, v4.16b, #3 + shl v11.16b, v5.16b, #3 + eor v0.16b, v0.16b, v10.16b + eor v1.16b, v1.16b, v11.16b + eor v10.16b, v8.16b, v0.16b + eor v11.16b, v9.16b, v1.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v8.16b, v12.16b, v0.16b + eor v9.16b, v13.16b, v1.16b + eor v12.16b, v12.16b, v10.16b + eor v13.16b, v13.16b, v11.16b + eor v10.16b, v10.16b, v4.16b + eor 
v11.16b, v11.16b, v5.16b + shl v4.4s, v10.4s, #8 + shl v5.4s, v11.4s, #8 + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + sri v4.4s, v10.4s, #24 + sri v5.4s, v11.4s, #24 + eor v4.16b, v4.16b, v12.16b + eor v5.16b, v5.16b, v13.16b + shl v10.4s, v0.4s, #24 + shl v11.4s, v1.4s, #24 + eor v4.16b, v4.16b, v8.16b + eor v5.16b, v5.16b, v9.16b + sri v10.4s, v0.4s, #8 + sri v11.4s, v1.4s, #8 + eor v4.16b, v4.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + # XOR in Key Schedule + ld1 {v0.2d}, [x25], #16 + eor v4.16b, v4.16b, v0.16b + eor v5.16b, v5.16b, v0.16b + # Round Done + movi v12.16b, #0x40 + movi v13.16b, #0x80 + movi v14.16b, #0xc0 + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v5.16b, v12.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b + eor v10.16b, v4.16b, v13.16b + eor v11.16b, v5.16b, v13.16b + tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b + tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + eor v8.16b, v4.16b, v14.16b + eor v9.16b, v5.16b, v14.16b + orr v0.16b, v0.16b, v10.16b + orr v1.16b, v1.16b, v11.16b + tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b + tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b + orr v0.16b, v0.16b, v8.16b + orr v1.16b, v1.16b, v9.16b + ld1 {v4.16b}, [x23] + tbl v0.16b, {v0.16b}, v4.16b + tbl v1.16b, {v1.16b}, v4.16b + # XOR in Key Schedule + ld1 {v4.2d}, [x25], #16 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v4.16b + # Round Done + rev32 v0.16b, v0.16b + rev32 v1.16b, v1.16b + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v3.16b + st1 {v0.16b, v1.16b}, [x1], #32 + and x16, x17, x11, asr 63 + extr x9, x11, x10, #63 + eor x8, x16, x10, lsl 1 + sub w2, w2, #32 +L_AES_XTS_decrypt_NEON_start_1: + ld1 {v3.2d}, [x23] + mov v2.d[0], x8 + mov v2.d[1], x9 + 
cmp w2, #16 + blt L_AES_XTS_decrypt_NEON_start_partial + mov x25, x4 + ld1 {v0.16b}, [x0], #16 + ld1 {v4.2d}, [x25], #16 + eor v0.16b, v0.16b, v2.16b + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v4.16b + sub w24, w7, #2 +L_AES_XTS_decrypt_NEON_loop_nr_1: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + sshr v10.16b, v4.16b, #7 + ushr v11.16b, v4.16b, #6 + ushr v8.16b, v4.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v4.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v4.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v4.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v4.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v4.16b + shl v4.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v4.4s, v9.4s, #24 + eor v4.16b, v4.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v4.16b, v4.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v4.16b, v4.16b, v9.16b + ld1 {v0.2d}, [x25], #16 + # XOR in Key Schedule + eor v4.16b, v4.16b, v0.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + sshr v10.16b, v0.16b, #7 + ushr v11.16b, v0.16b, #6 + ushr v8.16b, 
v0.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v0.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v0.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v0.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v0.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v0.16b + shl v0.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v0.4s, v9.4s, #24 + eor v0.16b, v0.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v0.16b, v0.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v0.16b, v0.16b, v9.16b + ld1 {v4.2d}, [x25], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + subs w24, w24, #2 + bne L_AES_XTS_decrypt_NEON_loop_nr_1 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + sshr v10.16b, v4.16b, #7 + ushr v11.16b, v4.16b, #6 + ushr v8.16b, v4.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v4.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v4.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v4.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v4.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v4.16b + shl v4.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v4.4s, v9.4s, #24 + eor v4.16b, v4.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v4.16b, v4.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v4.16b, v4.16b, v9.16b + ld1 {v0.2d}, [x25], #16 + # XOR in Key Schedule + eor v4.16b, v4.16b, v0.16b + eor 
v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x25], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v2.16b + st1 {v0.16b}, [x1], #16 + sub w2, w2, #16 + cbz w19, L_AES_XTS_decrypt_NEON_data_done + and x16, x17, x9, asr 63 + extr x9, x9, x8, #63 + eor x8, x16, x8, lsl 1 +L_AES_XTS_decrypt_NEON_start_partial: + mov w2, w19 + cbz w2, L_AES_XTS_decrypt_NEON_data_done + mov v2.d[0], x8 + mov v2.d[1], x9 + and x16, x17, x9, asr 63 + extr x11, x9, x8, #63 + eor x10, x16, x8, lsl 1 + mov v1.d[0], x10 + mov v1.d[1], x11 + mov x25, x4 + ld1 {v0.16b}, [x0], #16 + ld1 {v4.2d}, [x25], #16 + eor v0.16b, v0.16b, v1.16b + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v4.16b + sub w24, w7, #2 +L_AES_XTS_decrypt_NEON_loop_nr_partial_1: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + sshr v10.16b, v4.16b, #7 + ushr v11.16b, v4.16b, #6 + ushr v8.16b, v4.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v4.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v4.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v4.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, 
v10.16b, v8.16b + eor v8.16b, v8.16b, v4.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v4.16b + shl v4.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v4.4s, v9.4s, #24 + eor v4.16b, v4.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v4.16b, v4.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v4.16b, v4.16b, v9.16b + ld1 {v0.2d}, [x25], #16 + # XOR in Key Schedule + eor v4.16b, v4.16b, v0.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + sshr v10.16b, v0.16b, #7 + ushr v11.16b, v0.16b, #6 + ushr v8.16b, v0.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v0.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v0.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v0.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v0.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v0.16b + shl v0.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v0.4s, v9.4s, #24 + eor v0.16b, v0.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v0.16b, v0.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v0.16b, v0.16b, v9.16b + ld1 {v4.2d}, [x25], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + subs w24, w24, #2 + bne L_AES_XTS_decrypt_NEON_loop_nr_partial_1 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, 
{v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + sshr v10.16b, v4.16b, #7 + ushr v11.16b, v4.16b, #6 + ushr v8.16b, v4.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v4.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v4.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v4.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v4.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v4.16b + shl v4.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v4.4s, v9.4s, #24 + eor v4.16b, v4.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v4.16b, v4.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v4.16b, v4.16b, v9.16b + ld1 {v0.2d}, [x25], #16 + # XOR in Key Schedule + eor v4.16b, v4.16b, v0.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x25], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v1.16b + st1 {v0.2d}, [x6] + add x1, x1, #16 + mov w16, w2 +L_AES_XTS_decrypt_NEON_start_byte: + ldrb w10, [x6] + ldrb w11, [x0], #1 + strb w10, [x1], #1 + strb w11, [x6], #1 + subs w16, w16, #1 + bgt L_AES_XTS_decrypt_NEON_start_byte + sub x1, x1, x2 + sub x6, x6, x2 + sub x1, x1, #16 + mov x25, x4 + ld1 {v0.2d}, [x6] + ld1 {v4.2d}, [x25], #16 + eor v0.16b, v0.16b, v2.16b + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v4.16b + sub w24, w7, #2 
+L_AES_XTS_decrypt_NEON_loop_nr_partial_2: + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + sshr v10.16b, v4.16b, #7 + ushr v11.16b, v4.16b, #6 + ushr v8.16b, v4.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v4.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v4.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v4.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v4.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v4.16b + shl v4.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v4.4s, v9.4s, #24 + eor v4.16b, v4.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v4.16b, v4.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v4.16b, v4.16b, v9.16b + ld1 {v0.2d}, [x25], #16 + # XOR in Key Schedule + eor v4.16b, v4.16b, v0.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + sshr v10.16b, v0.16b, #7 + ushr v11.16b, v0.16b, #6 + ushr v8.16b, v0.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v0.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v0.16b, #3 + eor v8.16b, v8.16b, 
v9.16b + shl v9.16b, v0.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v0.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v0.16b + shl v0.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v0.4s, v9.4s, #24 + eor v0.16b, v0.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v0.16b, v0.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v0.16b, v0.16b, v9.16b + ld1 {v4.2d}, [x25], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + subs w24, w24, #2 + bne L_AES_XTS_decrypt_NEON_loop_nr_partial_2 + eor v8.16b, v0.16b, v12.16b + eor v9.16b, v0.16b, v13.16b + eor v10.16b, v0.16b, v14.16b + tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b + tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v4.16b, v4.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v4.16b, v4.16b, v9.16b + tbl v4.16b, {v4.16b}, v3.16b + sshr v10.16b, v4.16b, #7 + ushr v11.16b, v4.16b, #6 + ushr v8.16b, v4.16b, #5 + and v10.16b, v10.16b, v15.16b + pmul v11.16b, v11.16b, v15.16b + pmul v8.16b, v8.16b, v15.16b + shl v9.16b, v4.16b, #1 + eor v10.16b, v10.16b, v9.16b + shl v9.16b, v4.16b, #3 + eor v8.16b, v8.16b, v9.16b + shl v9.16b, v4.16b, #2 + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v10.16b, v8.16b + eor v8.16b, v8.16b, v4.16b + eor v10.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v9.16b + eor v9.16b, v9.16b, v4.16b + shl v4.4s, v9.4s, #8 + rev32 v10.8h, v10.8h + sri v4.4s, v9.4s, #24 + eor v4.16b, v4.16b, v11.16b + shl v9.4s, v8.4s, #24 + eor v4.16b, v4.16b, v10.16b + sri v9.4s, v8.4s, #8 + eor v4.16b, v4.16b, v9.16b + ld1 {v0.2d}, [x25], #16 + # XOR in Key Schedule + eor v4.16b, v4.16b, v0.16b + eor v8.16b, v4.16b, v12.16b + eor v9.16b, v4.16b, v13.16b + eor v10.16b, v4.16b, v14.16b + tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b + tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b 
+ tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b + tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b + orr v0.16b, v0.16b, v8.16b + orr v9.16b, v9.16b, v10.16b + orr v0.16b, v0.16b, v9.16b + tbl v0.16b, {v0.16b}, v3.16b + ld1 {v4.2d}, [x25], #16 + # XOR in Key Schedule + eor v0.16b, v0.16b, v4.16b + rev32 v0.16b, v0.16b + eor v0.16b, v0.16b, v2.16b + st1 {v0.16b}, [x1] +L_AES_XTS_decrypt_NEON_data_done: + ldp x17, x19, [x29, #16] + ldp x20, x21, [x29, #32] + ldp x22, x23, [x29, #48] + ldp x24, x25, [x29, #64] + ldp d8, d9, [x29, #80] + ldp d10, d11, [x29, #96] + ldp d12, d13, [x29, #112] + ldp d14, d15, [x29, #128] + ldp x29, x30, [sp], #0x90 + ret +#ifndef __APPLE__ + .size AES_XTS_decrypt_NEON,.-AES_XTS_decrypt_NEON +#endif /* __APPLE__ */ +#endif /* HAVE_AES_DECRYPT */ +#endif /* WOLFSSL_AES_XTS */ +#endif /* !WOLFSSL_ARMASM_NO_NEON */ +#ifndef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP +#ifdef HAVE_AES_DECRYPT +#ifndef __APPLE__ + .text + .type L_AES_ARM64_td, %object + .section .rodata + .size L_AES_ARM64_td, 1024 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_AES_ARM64_td: + .word 0x5051f4a7 + .word 0x537e4165 + .word 0xc31a17a4 + .word 0x963a275e + .word 0xcb3bab6b + .word 0xf11f9d45 + .word 0xabacfa58 + .word 0x934be303 + .word 0x552030fa + .word 0xf6ad766d + .word 0x9188cc76 + .word 0x25f5024c + .word 0xfc4fe5d7 + .word 0xd7c52acb + .word 0x80263544 + .word 0x8fb562a3 + .word 0x49deb15a + .word 0x6725ba1b + .word 0x9845ea0e + .word 0xe15dfec0 + .word 0x02c32f75 + .word 0x12814cf0 + .word 0xa38d4697 + .word 0xc66bd3f9 + .word 0xe7038f5f + .word 0x9515929c + .word 0xebbf6d7a + .word 0xda955259 + .word 0x2dd4be83 + .word 0xd3587421 + .word 0x2949e069 + .word 0x448ec9c8 + .word 0x6a75c289 + .word 0x78f48e79 + .word 0x6b99583e + .word 0xdd27b971 + .word 0xb6bee14f + .word 0x17f088ad + .word 0x66c920ac + .word 0xb47dce3a + .word 0x1863df4a + .word 0x82e51a31 + .word 
0x60975133 + .word 0x4562537f + .word 0xe0b16477 + .word 0x84bb6bae + .word 0x1cfe81a0 + .word 0x94f9082b + .word 0x58704868 + .word 0x198f45fd + .word 0x8794de6c + .word 0xb7527bf8 + .word 0x23ab73d3 + .word 0xe2724b02 + .word 0x57e31f8f + .word 0x2a6655ab + .word 0x07b2eb28 + .word 0x032fb5c2 + .word 0x9a86c57b + .word 0xa5d33708 + .word 0xf2302887 + .word 0xb223bfa5 + .word 0xba02036a + .word 0x5ced1682 + .word 0x2b8acf1c + .word 0x92a779b4 + .word 0xf0f307f2 + .word 0xa14e69e2 + .word 0xcd65daf4 + .word 0xd50605be + .word 0x1fd13462 + .word 0x8ac4a6fe + .word 0x9d342e53 + .word 0xa0a2f355 + .word 0x32058ae1 + .word 0x75a4f6eb + .word 0x390b83ec + .word 0xaa4060ef + .word 0x065e719f + .word 0x51bd6e10 + .word 0xf93e218a + .word 0x3d96dd06 + .word 0xaedd3e05 + .word 0x464de6bd + .word 0xb591548d + .word 0x0571c45d + .word 0x6f0406d4 + .word 0xff605015 + .word 0x241998fb + .word 0x97d6bde9 + .word 0xcc894043 + .word 0x7767d99e + .word 0xbdb0e842 + .word 0x8807898b + .word 0x38e7195b + .word 0xdb79c8ee + .word 0x47a17c0a + .word 0xe97c420f + .word 0xc9f8841e + .word 0x00000000 + .word 0x83098086 + .word 0x48322bed + .word 0xac1e1170 + .word 0x4e6c5a72 + .word 0xfbfd0eff + .word 0x560f8538 + .word 0x1e3daed5 + .word 0x27362d39 + .word 0x640a0fd9 + .word 0x21685ca6 + .word 0xd19b5b54 + .word 0x3a24362e + .word 0xb10c0a67 + .word 0x0f9357e7 + .word 0xd2b4ee96 + .word 0x9e1b9b91 + .word 0x4f80c0c5 + .word 0xa261dc20 + .word 0x695a774b + .word 0x161c121a + .word 0x0ae293ba + .word 0xe5c0a02a + .word 0x433c22e0 + .word 0x1d121b17 + .word 0x0b0e090d + .word 0xadf28bc7 + .word 0xb92db6a8 + .word 0xc8141ea9 + .word 0x8557f119 + .word 0x4caf7507 + .word 0xbbee99dd + .word 0xfda37f60 + .word 0x9ff70126 + .word 0xbc5c72f5 + .word 0xc544663b + .word 0x345bfb7e + .word 0x768b4329 + .word 0xdccb23c6 + .word 0x68b6edfc + .word 0x63b8e4f1 + .word 0xcad731dc + .word 0x10426385 + .word 0x40139722 + .word 0x2084c611 + .word 0x7d854a24 + .word 0xf8d2bb3d + .word 0x11aef932 + .word 
0x6dc729a1 + .word 0x4b1d9e2f + .word 0xf3dcb230 + .word 0xec0d8652 + .word 0xd077c1e3 + .word 0x6c2bb316 + .word 0x99a970b9 + .word 0xfa119448 + .word 0x2247e964 + .word 0xc4a8fc8c + .word 0x1aa0f03f + .word 0xd8567d2c + .word 0xef223390 + .word 0xc787494e + .word 0xc1d938d1 + .word 0xfe8ccaa2 + .word 0x3698d40b + .word 0xcfa6f581 + .word 0x28a57ade + .word 0x26dab78e + .word 0xa43fadbf + .word 0xe42c3a9d + .word 0x0d507892 + .word 0x9b6a5fcc + .word 0x62547e46 + .word 0xc2f68d13 + .word 0xe890d8b8 + .word 0x5e2e39f7 + .word 0xf582c3af + .word 0xbe9f5d80 + .word 0x7c69d093 + .word 0xa96fd52d + .word 0xb3cf2512 + .word 0x3bc8ac99 + .word 0xa710187d + .word 0x6ee89c63 + .word 0x7bdb3bbb + .word 0x09cd2678 + .word 0xf46e5918 + .word 0x01ec9ab7 + .word 0xa8834f9a + .word 0x65e6956e + .word 0x7eaaffe6 + .word 0x0821bccf + .word 0xe6ef15e8 + .word 0xd9bae79b + .word 0xce4a6f36 + .word 0xd4ea9f09 + .word 0xd629b07c + .word 0xaf31a4b2 + .word 0x312a3f23 + .word 0x30c6a594 + .word 0xc035a266 + .word 0x37744ebc + .word 0xa6fc82ca + .word 0xb0e090d0 + .word 0x1533a7d8 + .word 0x4af10498 + .word 0xf741ecda + .word 0x0e7fcd50 + .word 0x2f1791f6 + .word 0x8d764dd6 + .word 0x4d43efb0 + .word 0x54ccaa4d + .word 0xdfe49604 + .word 0xe39ed1b5 + .word 0x1b4c6a88 + .word 0xb8c12c1f + .word 0x7f466551 + .word 0x049d5eea + .word 0x5d018c35 + .word 0x73fa8774 + .word 0x2efb0b41 + .word 0x5ab3671d + .word 0x5292dbd2 + .word 0x33e91056 + .word 0x136dd647 + .word 0x8c9ad761 + .word 0x7a37a10c + .word 0x8e59f814 + .word 0x89eb133c + .word 0xeecea927 + .word 0x35b761c9 + .word 0xede11ce5 + .word 0x3c7a47b1 + .word 0x599cd2df + .word 0x3f55f273 + .word 0x791814ce + .word 0xbf73c737 + .word 0xea53f7cd + .word 0x5b5ffdaa + .word 0x14df3d6f + .word 0x867844db + .word 0x81caaff3 + .word 0x3eb968c4 + .word 0x2c382434 + .word 0x5fc2a340 + .word 0x72161dc3 + .word 0x0cbce225 + .word 0x8b283c49 + .word 0x41ff0d95 + .word 0x7139a801 + .word 0xde080cb3 + .word 0x9cd8b4e4 + .word 0x906456c1 + .word 
0x617bcb84 + .word 0x70d532b6 + .word 0x74486c5c + .word 0x42d0b857 +#endif /* HAVE_AES_DECRYPT */ +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \ + defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +#ifndef __APPLE__ + .text + .type L_AES_ARM64_te, %object + .section .rodata + .size L_AES_ARM64_te, 1024 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_AES_ARM64_te: + .word 0xa5c66363 + .word 0x84f87c7c + .word 0x99ee7777 + .word 0x8df67b7b + .word 0x0dfff2f2 + .word 0xbdd66b6b + .word 0xb1de6f6f + .word 0x5491c5c5 + .word 0x50603030 + .word 0x03020101 + .word 0xa9ce6767 + .word 0x7d562b2b + .word 0x19e7fefe + .word 0x62b5d7d7 + .word 0xe64dabab + .word 0x9aec7676 + .word 0x458fcaca + .word 0x9d1f8282 + .word 0x4089c9c9 + .word 0x87fa7d7d + .word 0x15effafa + .word 0xebb25959 + .word 0xc98e4747 + .word 0x0bfbf0f0 + .word 0xec41adad + .word 0x67b3d4d4 + .word 0xfd5fa2a2 + .word 0xea45afaf + .word 0xbf239c9c + .word 0xf753a4a4 + .word 0x96e47272 + .word 0x5b9bc0c0 + .word 0xc275b7b7 + .word 0x1ce1fdfd + .word 0xae3d9393 + .word 0x6a4c2626 + .word 0x5a6c3636 + .word 0x417e3f3f + .word 0x02f5f7f7 + .word 0x4f83cccc + .word 0x5c683434 + .word 0xf451a5a5 + .word 0x34d1e5e5 + .word 0x08f9f1f1 + .word 0x93e27171 + .word 0x73abd8d8 + .word 0x53623131 + .word 0x3f2a1515 + .word 0x0c080404 + .word 0x5295c7c7 + .word 0x65462323 + .word 0x5e9dc3c3 + .word 0x28301818 + .word 0xa1379696 + .word 0x0f0a0505 + .word 0xb52f9a9a + .word 0x090e0707 + .word 0x36241212 + .word 0x9b1b8080 + .word 0x3ddfe2e2 + .word 0x26cdebeb + .word 0x694e2727 + .word 0xcd7fb2b2 + .word 0x9fea7575 + .word 0x1b120909 + .word 0x9e1d8383 + .word 0x74582c2c + .word 0x2e341a1a + .word 0x2d361b1b + .word 0xb2dc6e6e + .word 0xeeb45a5a + .word 0xfb5ba0a0 + .word 0xf6a45252 + .word 0x4d763b3b + .word 0x61b7d6d6 + .word 0xce7db3b3 + .word 0x7b522929 + .word 
0x3edde3e3 + .word 0x715e2f2f + .word 0x97138484 + .word 0xf5a65353 + .word 0x68b9d1d1 + .word 0x00000000 + .word 0x2cc1eded + .word 0x60402020 + .word 0x1fe3fcfc + .word 0xc879b1b1 + .word 0xedb65b5b + .word 0xbed46a6a + .word 0x468dcbcb + .word 0xd967bebe + .word 0x4b723939 + .word 0xde944a4a + .word 0xd4984c4c + .word 0xe8b05858 + .word 0x4a85cfcf + .word 0x6bbbd0d0 + .word 0x2ac5efef + .word 0xe54faaaa + .word 0x16edfbfb + .word 0xc5864343 + .word 0xd79a4d4d + .word 0x55663333 + .word 0x94118585 + .word 0xcf8a4545 + .word 0x10e9f9f9 + .word 0x06040202 + .word 0x81fe7f7f + .word 0xf0a05050 + .word 0x44783c3c + .word 0xba259f9f + .word 0xe34ba8a8 + .word 0xf3a25151 + .word 0xfe5da3a3 + .word 0xc0804040 + .word 0x8a058f8f + .word 0xad3f9292 + .word 0xbc219d9d + .word 0x48703838 + .word 0x04f1f5f5 + .word 0xdf63bcbc + .word 0xc177b6b6 + .word 0x75afdada + .word 0x63422121 + .word 0x30201010 + .word 0x1ae5ffff + .word 0x0efdf3f3 + .word 0x6dbfd2d2 + .word 0x4c81cdcd + .word 0x14180c0c + .word 0x35261313 + .word 0x2fc3ecec + .word 0xe1be5f5f + .word 0xa2359797 + .word 0xcc884444 + .word 0x392e1717 + .word 0x5793c4c4 + .word 0xf255a7a7 + .word 0x82fc7e7e + .word 0x477a3d3d + .word 0xacc86464 + .word 0xe7ba5d5d + .word 0x2b321919 + .word 0x95e67373 + .word 0xa0c06060 + .word 0x98198181 + .word 0xd19e4f4f + .word 0x7fa3dcdc + .word 0x66442222 + .word 0x7e542a2a + .word 0xab3b9090 + .word 0x830b8888 + .word 0xca8c4646 + .word 0x29c7eeee + .word 0xd36bb8b8 + .word 0x3c281414 + .word 0x79a7dede + .word 0xe2bc5e5e + .word 0x1d160b0b + .word 0x76addbdb + .word 0x3bdbe0e0 + .word 0x56643232 + .word 0x4e743a3a + .word 0x1e140a0a + .word 0xdb924949 + .word 0x0a0c0606 + .word 0x6c482424 + .word 0xe4b85c5c + .word 0x5d9fc2c2 + .word 0x6ebdd3d3 + .word 0xef43acac + .word 0xa6c46262 + .word 0xa8399191 + .word 0xa4319595 + .word 0x37d3e4e4 + .word 0x8bf27979 + .word 0x32d5e7e7 + .word 0x438bc8c8 + .word 0x596e3737 + .word 0xb7da6d6d + .word 0x8c018d8d + .word 0x64b1d5d5 + .word 
0xd29c4e4e + .word 0xe049a9a9 + .word 0xb4d86c6c + .word 0xfaac5656 + .word 0x07f3f4f4 + .word 0x25cfeaea + .word 0xafca6565 + .word 0x8ef47a7a + .word 0xe947aeae + .word 0x18100808 + .word 0xd56fbaba + .word 0x88f07878 + .word 0x6f4a2525 + .word 0x725c2e2e + .word 0x24381c1c + .word 0xf157a6a6 + .word 0xc773b4b4 + .word 0x5197c6c6 + .word 0x23cbe8e8 + .word 0x7ca1dddd + .word 0x9ce87474 + .word 0x213e1f1f + .word 0xdd964b4b + .word 0xdc61bdbd + .word 0x860d8b8b + .word 0x850f8a8a + .word 0x90e07070 + .word 0x427c3e3e + .word 0xc471b5b5 + .word 0xaacc6666 + .word 0xd8904848 + .word 0x05060303 + .word 0x01f7f6f6 + .word 0x121c0e0e + .word 0xa3c26161 + .word 0x5f6a3535 + .word 0xf9ae5757 + .word 0xd069b9b9 + .word 0x91178686 + .word 0x5899c1c1 + .word 0x273a1d1d + .word 0xb9279e9e + .word 0x38d9e1e1 + .word 0x13ebf8f8 + .word 0xb32b9898 + .word 0x33221111 + .word 0xbbd26969 + .word 0x70a9d9d9 + .word 0x89078e8e + .word 0xa7339494 + .word 0xb62d9b9b + .word 0x223c1e1e + .word 0x92158787 + .word 0x20c9e9e9 + .word 0x4987cece + .word 0xffaa5555 + .word 0x78502828 + .word 0x7aa5dfdf + .word 0x8f038c8c + .word 0xf859a1a1 + .word 0x80098989 + .word 0x171a0d0d + .word 0xda65bfbf + .word 0x31d7e6e6 + .word 0xc6844242 + .word 0xb8d06868 + .word 0xc3824141 + .word 0xb0299999 + .word 0x775a2d2d + .word 0x111e0f0f + .word 0xcb7bb0b0 + .word 0xfca85454 + .word 0xd66dbbbb + .word 0x3a2c1616 +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || + * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */
+#ifdef HAVE_AES_DECRYPT
+/* AES_invert_key
+ *
+ * Converts an expanded key schedule, in place, into the form used by the
+ * table-based decryption code in this file.
+ *
+ *   x0  pointer to the key schedule: (rounds + 1) round keys of 16 bytes
+ *   w1  number of rounds; must be even and non-zero because the swap loop
+ *       counts down by 2 until it reaches exactly zero
+ *
+ * Step 1 swaps round key i with round key (rounds - i), reversing the
+ * order of the schedule. Step 2 rewrites every word of round keys
+ * 1 .. rounds-1 by sending each byte through the S-box byte held in
+ * L_AES_ARM64_te and then through L_AES_ARM64_td, and XORing the four
+ * rotated results together (the usual equivalent-inverse-cipher
+ * InvMixColumns key transform).
+ *
+ * Only the low 32 bits of x1 are read.
+ * NOTE(review): this assumes the C prototype passes rounds as a 32-bit
+ * integer, in which case AAPCS64 leaves the upper half of x1 unspecified;
+ * confirm against the declaration in aes.h.
+ *
+ * Modifies x0, x2-x13 and the condition flags. No stack is used.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_invert_key
+.type AES_invert_key,@function
+.align 2
+AES_invert_key:
+#else
+.section __TEXT,__text
+.globl _AES_invert_key
+.p2align 2
+_AES_invert_key:
+#endif /* __APPLE__ */
+        # x2 = encryption table; byte 0 of each word is read as the S-box
+#ifndef __APPLE__
+        adrp x2, L_AES_ARM64_te
+        add x2, x2, :lo12:L_AES_ARM64_te
+#else
+        adrp x2, L_AES_ARM64_te@PAGE
+        add x2, x2, :lo12:L_AES_ARM64_te@PAGEOFF
+#endif /* __APPLE__ */
+        # x3 = decryption table
+#ifndef __APPLE__
+        adrp x3, L_AES_ARM64_td
+        add x3, x3, :lo12:L_AES_ARM64_td
+#else
+        adrp x3, L_AES_ARM64_td@PAGE
+        add x3, x3, :lo12:L_AES_ARM64_td@PAGEOFF
+#endif /* __APPLE__ */
+        # x12 = address of the last round key: ks + rounds * 16.
+        # The count is zero-extended from w1 so stale upper bits of x1
+        # cannot move the pointer.
+        add x12, x0, w1, uxtw #4
+        mov w13, w1
+L_AES_invert_key_loop:
+        # Swap the 16-byte round keys at x0 (front) and x12 (back)
+        ldp w4, w5, [x0]
+        ldnp w6, w7, [x0, #8]
+        ldp w8, w9, [x12]
+        ldnp w10, w11, [x12, #8]
+        stp w4, w5, [x12]
+        stnp w6, w7, [x12, #8]
+        stp w8, w9, [x0], #8
+        stp w10, w11, [x0], #8
+        subs w13, w13, #2
+        sub x12, x12, #16
+        bne L_AES_invert_key_loop
+        # x0 advanced by rounds * 8 bytes; rewind it, then skip round key 0
+        sub x0, x0, w1, uxtw #3
+        add x0, x0, #16
+        # Transform round keys 1 .. rounds-1
+        sub w13, w1, #1
+L_AES_invert_key_mix_loop:
+        ldp w4, w5, [x0]
+        ldnp w6, w7, [x0, #8]
+        # Word 0: split into bytes, scale to te word offsets
+        ubfx w8, w4, #0, #8
+        ubfx w9, w4, #8, #8
+        ubfx w10, w4, #16, #8
+        ubfx w11, w4, #24, #8
+        lsl w8, w8, #2
+        lsl w9, w9, #2
+        lsl w10, w10, #2
+        lsl w11, w11, #2
+        # S-box lookup (first byte of the te entry), then td lookup
+        ldrb w8, [x2, x8, LSL 0]
+        ldrb w9, [x2, x9, LSL 0]
+        ldrb w10, [x2, x10, LSL 0]
+        ldrb w11, [x2, x11, LSL 0]
+        ldr w8, [x3, x8, LSL 2]
+        ldr w9, [x3, x9, LSL 2]
+        ldr w10, [x3, x10, LSL 2]
+        ldr w11, [x3, x11, LSL 2]
+        eor w10, w10, w8, ror 16
+        eor w10, w10, w9, ror 8
+        eor w10, w10, w11, ror 24
+        str w10, [x0], #4
+        # Word 1
+        ubfx w8, w5, #0, #8
+        ubfx w9, w5, #8, #8
+        ubfx w10, w5, #16, #8
+        ubfx w11, w5, #24, #8
+        lsl w8, w8, #2
+        lsl w9, w9, #2
+        lsl w10, w10, #2
+        lsl w11, w11, #2
+        ldrb w8, [x2, x8, LSL 0]
+        ldrb w9, [x2, x9, LSL 0]
+        ldrb w10, [x2, x10, LSL 0]
+        ldrb w11, [x2, x11, LSL 0]
+        ldr w8, [x3, x8, LSL 2]
+        ldr w9, [x3, x9, LSL 2]
+        ldr w10, [x3, x10, LSL 2]
+        ldr w11, [x3, x11, LSL 2]
+        eor w10, w10, w8, ror 16
+        eor w10, w10, w9, ror 8
+        eor w10, w10, w11, ror 24
+        str w10, [x0], #4
+        # Word 2
+        ubfx w8, w6, #0, #8
+        ubfx w9, w6, #8, #8
+        ubfx w10, w6, #16, #8
+        ubfx w11, w6, #24, #8
+        lsl w8, w8, #2
+        lsl w9, w9, #2
+        lsl w10, w10, #2
+        lsl w11, w11, #2
+        ldrb w8, [x2, x8, LSL 0]
+        ldrb w9, [x2, x9, LSL 0]
+        ldrb w10, [x2, x10, LSL 0]
+        ldrb w11, [x2, x11, LSL 0]
+        ldr w8, [x3, x8, LSL 2]
+        ldr w9, [x3, x9, LSL 2]
+        ldr w10, [x3, x10, LSL 2]
+        ldr w11, [x3, x11, LSL 2]
+        eor w10, w10, w8, ror 16
+        eor w10, w10, w9, ror 8
+        eor w10, w10, w11, ror 24
+        str w10, [x0], #4
+        # Word 3
+        ubfx w8, w7, #0, #8
+        ubfx w9, w7, #8, #8
+        ubfx w10, w7, #16, #8
+        ubfx w11, w7, #24, #8
+        lsl w8, w8, #2
+        lsl w9, w9, #2
+        lsl w10, w10, #2
+        lsl w11, w11, #2
+        ldrb w8, [x2, x8, LSL 0]
+        ldrb w9, [x2, x9, LSL 0]
+        ldrb w10, [x2, x10, LSL 0]
+        ldrb w11, [x2, x11, LSL 0]
+        ldr w8, [x3, x8, LSL 2]
+        ldr w9, [x3, x9, LSL 2]
+        ldr w10, [x3, x10, LSL 2]
+        ldr w11, [x3, x11, LSL 2]
+        eor w10, w10, w8, ror 16
+        eor w10, w10, w9, ror 8
+        eor w10, w10, w11, ror 24
+        str w10, [x0], #4
+        subs w13, w13, #1
+        bne L_AES_invert_key_mix_loop
+        ret
+#ifndef __APPLE__
+        .size AES_invert_key,.-AES_invert_key
+#endif /* __APPLE__ */
+#endif /* HAVE_AES_DECRYPT */
+/* Round constants consumed one word at a time by AES_set_encrypt_key.
+ * The constant sits in the top byte of each word.
+ */
+#ifndef __APPLE__
+        .text
+        .type L_AES_ARM64_rcon, %object
+        .section .rodata
+        .size L_AES_ARM64_rcon, 40
+#else
+        .section __DATA,__data
+#endif /* __APPLE__ */
+#ifndef __APPLE__
+        .align 3
+#else
+        .p2align 3
+#endif /* __APPLE__ */
+L_AES_ARM64_rcon:
+        .word 0x01000000
+        .word 0x02000000
+        .word 0x04000000
+        .word 0x08000000
+        .word 0x10000000
+        .word 0x20000000
+        .word 0x40000000
+        .word 0x80000000
+        .word 0x1b000000
+        .word 0x36000000
+/* AES_set_encrypt_key
+ *
+ * Expands a raw AES key into the encryption key schedule.
+ *
+ *   x0  pointer to the key bytes (16, 24 or 32 bytes are read)
+ *   w1  key length in bits: 0x80 selects the 128-bit path, 0xc0 the
+ *       192-bit path; every other value takes the 256-bit path
+ *   x2  pointer to the output schedule; 44, 52 or 60 words are written
+ *       for the 128-, 192- and 256-bit paths respectively
+ *
+ * Each key word is byte-reversed with rev before it is stored. The S-box
+ * is read as the first byte of each L_AES_ARM64_te entry.
+ *
+ * Only the low 32 bits of x1 are compared.
+ * NOTE(review): this assumes the C prototype passes the length as a
+ * 32-bit integer, in which case AAPCS64 leaves the upper half of x1
+ * unspecified; confirm against the declaration in aes.h.
+ *
+ * Modifies x2-x12 and the condition flags. No stack is used.
+ */
+#ifndef __APPLE__
+.text
+.globl AES_set_encrypt_key
+.type AES_set_encrypt_key,@function
+.align 2
+AES_set_encrypt_key:
+#else
+.section __TEXT,__text
+.globl _AES_set_encrypt_key
+.p2align 2
+_AES_set_encrypt_key:
+#endif /* __APPLE__ */
+        # x5 = next round constant
+#ifndef __APPLE__
+        adrp x5, L_AES_ARM64_rcon
+        add x5, x5, :lo12:L_AES_ARM64_rcon
+#else
+        adrp x5, L_AES_ARM64_rcon@PAGE
+        add x5, x5, :lo12:L_AES_ARM64_rcon@PAGEOFF
+#endif /* __APPLE__ */
+        # x12 = encryption table, used here for its S-box byte
+#ifndef __APPLE__
+        adrp x12, L_AES_ARM64_te
+        add x12, x12, :lo12:L_AES_ARM64_te
+#else
+        adrp x12, L_AES_ARM64_te@PAGE
+        add x12, x12, :lo12:L_AES_ARM64_te@PAGEOFF
+#endif /* __APPLE__ */
+        # Dispatch on key size using the 32-bit view of the argument
+        cmp w1, #0x80
+        beq L_AES_set_encrypt_key_start_128
+        cmp w1, #0xc0
+        beq L_AES_set_encrypt_key_start_192
+        # 256-bit key: copy the eight key words into the schedule
+        ldr w6, [x0]
+        ldr w7, [x0, #4]
+        ldr w8, [x0, #8]
+        ldr w9, [x0, #12]
+        rev w6, w6
+        rev w7, w7
+        rev w8, w8
+        rev w9, w9
+        stp w6, w7, [x2], #8
+        stp w8, w9, [x2], #8
+        ldr w6, [x0, #16]
+        ldr w7, [x0, #20]
+        ldr w8, [x0, #24]
+        ldr w9, [x0, #28]
+        rev w6, w6
+        rev w7, w7
+        rev w8, w8
+        rev w9, w9
+        stp w6, w7, [x2]
+        stnp w8, w9, [x2, #8]
+        # x2 points at the older of the two most recent 4-word groups
+        sub x2, x2, #16
+        mov x4, #6
+L_AES_set_encrypt_key_loop_256:
+        # First half: rotate and substitute the last word, add rcon
+        ubfx w6, w9, #0, #8
+        ubfx w7, w9, #8, #8
+        ubfx w8, w9, #16, #8
+        ubfx w9, w9, #24, #8
+        lsl w6, w6, #2
+        lsl w7, w7, #2
+        lsl w8, w8, #2
+        lsl w9, w9, #2
+        ldrb w6, [x12, x6, LSL 0]
+        ldrb w7, [x12, x7, LSL 0]
+        ldrb w8, [x12, x8, LSL 0]
+        ldrb w9, [x12, x9, LSL 0]
+        eor w3, w9, w6, lsl 8
+        eor w3, w3, w7, lsl 16
+        eor w3, w3, w8, lsl 24
+        ldp w6, w7, [x2], #8
+        ldp w8, w9, [x2], #8
+        eor w6, w6, w3
+        ldr w3, [x5], #4
+        eor w6, w6, w3
+        eor w7, w7, w6
+        eor w8, w8, w7
+        eor w9, w9, w8
+        add x2, x2, #16
+        stp w6, w7, [x2]
+        stnp w8, w9, [x2, #8]
+        sub x2, x2, #16
+        # Second half: substitute the last word without rotation or rcon
+        mov w3, w9
+        ubfx w6, w3, #8, #8
+        ubfx w7, w3, #16, #8
+        ubfx w8, w3, #24, #8
+        ubfx w3, w3, #0, #8
+        lsl w6, w6, #2
+        lsl w7, w7, #2
+        lsl w8, w8, #2
+        lsl w3, w3, #2
+        ldrb w6, [x12, x6, LSL 0]
+        ldrb w8, [x12, x8, LSL 0]
+        ldrb w7, [x12, x7, LSL 0]
+        ldrb w3, [x12, x3, LSL 0]
+        eor w3, w3, w6, lsl 8
+        eor w3, w3, w7, lsl 16
+        eor w3, w3, w8, lsl 24
+        ldp w6, w7, [x2], #8
+        ldp w8, w9, [x2], #8
+        eor w6, w6, w3
+        eor w7, w7, w6
+        eor w8, w8, w7
+        eor w9, w9, w8
+        add x2, x2, #16
+        stp w6, w7, [x2]
+        stnp w8, w9, [x2, #8]
+        sub x2, x2, #16
+        subs x4, x4, #1
+        bne L_AES_set_encrypt_key_loop_256
+        # Last four words of the 256-bit schedule (first half only)
+        ubfx w6, w9, #0, #8
+        ubfx w7, w9, #8, #8
+        ubfx w8, w9, #16, #8
+        ubfx w9, w9, #24, #8
+        lsl w6, w6, #2
+        lsl w7, w7, #2
+        lsl w8, w8, #2
+        lsl w9, w9, #2
+        ldrb w6, [x12, x6, LSL 0]
+        ldrb w7, [x12, x7, LSL 0]
+        ldrb w8, [x12, x8, LSL 0]
+        ldrb w9, [x12, x9, LSL 0]
+        eor w3, w9, w6, lsl 8
+        eor w3, w3, w7, lsl 16
+        eor w3, w3, w8, lsl 24
+        ldp w6, w7, [x2], #8
+        ldp w8, w9, [x2], #8
+        eor w6, w6, w3
+        ldr w3, [x5], #4
+        eor w6, w6, w3
+        eor w7, w7, w6
+        eor w8, w8, w7
+        eor w9, w9, w8
+        add x2, x2, #16
+        stp w6, w7, [x2]
+        stnp w8, w9, [x2, #8]
+        sub x2, x2, #16
+        b L_AES_set_encrypt_key_end
+L_AES_set_encrypt_key_start_192:
+        # 192-bit key: copy the six key words into the schedule
+        ldr w6, [x0]
+        ldr w7, [x0, #4]
+        ldr w8, [x0, #8]
+        ldr w9, [x0, #12]
+        ldr w10, [x0, #16]
+        ldr w11, [x0, #20]
+        rev w6, w6
+        rev w7, w7
+        rev w8, w8
+        rev w9, w9
+        rev w10, w10
+        rev w11, w11
+        stp w6, w7, [x2]
+        stnp w8, w9, [x2, #8]
+        stnp w10, w11, [x2, #16]
+        mov x4, #7
+L_AES_set_encrypt_key_loop_192:
+        # Rotate and substitute the last word, add rcon, chain six words
+        ubfx w6, w11, #0, #8
+        ubfx w7, w11, #8, #8
+        ubfx w8, w11, #16, #8
+        ubfx w11, w11, #24, #8
+        lsl w6, w6, #2
+        lsl w7, w7, #2
+        lsl w8, w8, #2
+        lsl w11, w11, #2
+        ldrb w6, [x12, x6, LSL 0]
+        ldrb w7, [x12, x7, LSL 0]
+        ldrb w8, [x12, x8, LSL 0]
+        ldrb w11, [x12, x11, LSL 0]
+        eor w3, w11, w6, lsl 8
+        eor w3, w3, w7, lsl 16
+        eor w3, w3, w8, lsl 24
+        ldp w6, w7, [x2], #8
+        ldp w8, w9, [x2], #8
+        ldp w10, w11, [x2], #8
+        eor w6, w6, w3
+        ldr w3, [x5], #4
+        eor w6, w6, w3
+        eor w7, w7, w6
+        eor w8, w8, w7
+        eor w9, w9, w8
+        eor w10, w10, w9
+        eor w11, w11, w10
+        stp w6, w7, [x2]
+        stnp w8, w9, [x2, #8]
+        stnp w10, w11, [x2, #16]
+        subs x4, x4, #1
+        bne L_AES_set_encrypt_key_loop_192
+        # Last four words of the 192-bit schedule
+        ubfx w6, w11, #0, #8
+        ubfx w7, w11, #8, #8
+        ubfx w8, w11, #16, #8
+        ubfx w11, w11, #24, #8
+        lsl w6, w6, #2
+        lsl w7, w7, #2
+        lsl w8, w8, #2
+        lsl w11, w11, #2
+        ldrb w6, [x12, x6, LSL 0]
+        ldrb w7, [x12, x7, LSL 0]
+        ldrb w8, [x12, x8, LSL 0]
+        ldrb w11, [x12, x11, LSL 0]
+        eor w3, w11, w6, lsl 8
+        eor w3, w3, w7, lsl 16
+        eor w3, w3, w8, lsl 24
+        ldp w6, w7, [x2], #8
+        ldp w8, w9, [x2], #8
+        ldp w10, w11, [x2], #8
+        eor w6, w6, w3
+        ldr w3, [x5], #4
+        eor w6, w6, w3
+        eor w7, w7, w6
+        eor w8, w8, w7
+        eor w9, w9, w8
+        stp w6, w7, [x2]
+        stnp w8, w9, [x2, #8]
+        b L_AES_set_encrypt_key_end
+L_AES_set_encrypt_key_start_128:
+        # 128-bit key: copy the four key words into the schedule
+        ldr w6, [x0]
+        ldr w7, [x0, #4]
+        ldr w8, [x0, #8]
+        ldr w9, [x0, #12]
+        rev w6, w6
+        rev w7, w7
+        rev w8, w8
+        rev w9, w9
+        stp w6, w7, [x2]
+        stnp w8, w9, [x2, #8]
+        mov x4, #10
+L_AES_set_encrypt_key_loop_128:
+        # Rotate and substitute the last word, add rcon, chain four words
+        ubfx w6, w9, #0, #8
+        ubfx w7, w9, #8, #8
+        ubfx w8, w9, #16, #8
+        ubfx w9, w9, #24, #8
+        lsl w6, w6, #2
+        lsl w7, w7, #2
+        lsl w8, w8, #2
+        lsl w9, w9, #2
+        ldrb w6, [x12, x6, LSL 0]
+        ldrb w7, [x12, x7, LSL 0]
+        ldrb w8, [x12, x8, LSL 0]
+        ldrb w9, [x12, x9, LSL 0]
+        eor w3, w9, w6, lsl 8
+        eor w3, w3, w7, lsl 16
+        eor w3, w3, w8, lsl 24
+        ldp w6, w7, [x2], #8
+        ldp w8, w9, [x2], #8
+        eor w6, w6, w3
+        ldr w3, [x5], #4
+        eor w6, w6, w3
+        eor w7, w7, w6
+        eor w8, w8, w7
+        eor w9, w9, w8
+        stp w6, w7, [x2]
+        stnp w8, w9, [x2, #8]
+        subs x4, x4, #1
+        bne L_AES_set_encrypt_key_loop_128
+L_AES_set_encrypt_key_end:
+        ret
+#ifndef __APPLE__
+        .size AES_set_encrypt_key,.-AES_set_encrypt_key
+#endif /* __APPLE__ */
+#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_ECB) +#ifndef __APPLE__ +.text +.globl AES_ECB_encrypt +.type AES_ECB_encrypt,@function +.align 2 +AES_ECB_encrypt: +#else +.section __TEXT,__text +.globl _AES_ECB_encrypt +.p2align 2 +_AES_ECB_encrypt: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-32]! 
+ add x29, sp, #0 + str x17, [x29, #24] +#ifndef __APPLE__ + adrp x5, L_AES_ARM64_te + add x5, x5, :lo12:L_AES_ARM64_te +#else + adrp x5, L_AES_ARM64_te@PAGE + add x5, x5, :lo12:L_AES_ARM64_te@PAGEOFF +#endif /* __APPLE__ */ +L_AES_ECB_encrypt_loop_block_128: + mov x17, x3 + ldr x6, [x0] + ldr x7, [x0, #8] + rev32 x6, x6 + rev32 x7, x7 + ldp x10, x11, [x17], #16 + # Round: 0 - XOR in key schedule + eor x6, x6, x10 + eor x7, x7, x11 + sub w16, w4, #2 +L_AES_ECB_encrypt_loop_nr: + ubfx x10, x6, #48, #8 + ubfx x13, x6, #24, #8 + ubfx x14, x7, #8, #8 + ubfx x15, x7, #32, #8 + ldr x8, [x5] + ldr x8, [x5, #64] + ldr x8, [x5, #128] + ldr x8, [x5, #192] + ldr x8, [x5, #256] + ldr x8, [x5, #320] + ldr x8, [x5, #384] + ldr x8, [x5, #448] + ldr x8, [x5, #512] + ldr x8, [x5, #576] + ldr x8, [x5, #640] + ldr x8, [x5, #704] + ldr x8, [x5, #768] + ldr x8, [x5, #832] + ldr x8, [x5, #896] + ldr x8, [x5, #960] + ldr w10, [x5, x10, LSL 2] + ldr w13, [x5, x13, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ubfx x11, x7, #16, #8 + eor w10, w10, w13, ror 24 + ubfx x13, x6, #56, #8 + eor w10, w10, w14, ror 8 + ubfx x14, x7, #40, #8 + eor w10, w10, w15, ror 16 + ubfx x15, x6, #0, #8 + ldr w11, [x5, x11, LSL 2] + ldr w13, [x5, x13, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ubfx x12, x7, #48, #8 + eor w11, w11, w13, ror 24 + ubfx x13, x7, #24, #8 + eor w11, w11, w14, ror 8 + ubfx x14, x6, #8, #8 + eor w11, w11, w15, ror 16 + ubfx x15, x6, #32, #8 + bfi x10, x11, #32, #32 + ldr w12, [x5, x12, LSL 2] + ldr w13, [x5, x13, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ubfx x8, x7, #0, #8 + eor w12, w12, w13, ror 24 + ubfx x13, x6, #16, #8 + eor w12, w12, w14, ror 8 + ubfx x14, x7, #56, #8 + eor w11, w12, w15, ror 16 + ubfx x15, x6, #40, #8 + ldr w8, [x5, x8, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w13, [x5, x13, LSL 2] + ldr w15, [x5, x15, LSL 2] + eor w14, w14, w8, ror 24 + ldp x6, x7, [x17], #16 + eor w13, w13, w14, ror 24 + eor 
w13, w13, w15, ror 8 + bfi x11, x13, #32, #32 + # XOR in Key Schedule + eor x10, x10, x6 + eor x11, x11, x7 + ubfx x6, x10, #48, #8 + ubfx x9, x10, #24, #8 + ubfx x14, x11, #8, #8 + ubfx x15, x11, #32, #8 + ldr x12, [x5] + ldr x12, [x5, #64] + ldr x12, [x5, #128] + ldr x12, [x5, #192] + ldr x12, [x5, #256] + ldr x12, [x5, #320] + ldr x12, [x5, #384] + ldr x12, [x5, #448] + ldr x12, [x5, #512] + ldr x12, [x5, #576] + ldr x12, [x5, #640] + ldr x12, [x5, #704] + ldr x12, [x5, #768] + ldr x12, [x5, #832] + ldr x12, [x5, #896] + ldr x12, [x5, #960] + ldr w6, [x5, x6, LSL 2] + ldr w9, [x5, x9, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ubfx x7, x11, #16, #8 + eor w6, w6, w9, ror 24 + ubfx x9, x10, #56, #8 + eor w6, w6, w14, ror 8 + ubfx x14, x11, #40, #8 + eor w6, w6, w15, ror 16 + ubfx x15, x10, #0, #8 + ldr w7, [x5, x7, LSL 2] + ldr w9, [x5, x9, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ubfx x8, x11, #48, #8 + eor w7, w7, w9, ror 24 + ubfx x9, x11, #24, #8 + eor w7, w7, w14, ror 8 + ubfx x14, x10, #8, #8 + eor w7, w7, w15, ror 16 + ubfx x15, x10, #32, #8 + bfi x6, x7, #32, #32 + ldr w8, [x5, x8, LSL 2] + ldr w9, [x5, x9, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ubfx x12, x11, #0, #8 + eor w8, w8, w9, ror 24 + ubfx x9, x10, #16, #8 + eor w8, w8, w14, ror 8 + ubfx x14, x11, #56, #8 + eor w7, w8, w15, ror 16 + ubfx x15, x10, #40, #8 + ldr w12, [x5, x12, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w9, [x5, x9, LSL 2] + ldr w15, [x5, x15, LSL 2] + eor w14, w14, w12, ror 24 + ldp x10, x11, [x17], #16 + eor w9, w9, w14, ror 24 + eor w9, w9, w15, ror 8 + bfi x7, x9, #32, #32 + # XOR in Key Schedule + eor x6, x6, x10 + eor x7, x7, x11 + subs w16, w16, #2 + bne L_AES_ECB_encrypt_loop_nr + ubfx x10, x6, #48, #8 + ubfx x13, x6, #24, #8 + ubfx x14, x7, #8, #8 + ubfx x15, x7, #32, #8 + ldr x8, [x5] + ldr x8, [x5, #64] + ldr x8, [x5, #128] + ldr x8, [x5, #192] + ldr x8, [x5, #256] + ldr x8, [x5, #320] + ldr x8, [x5, 
#384] + ldr x8, [x5, #448] + ldr x8, [x5, #512] + ldr x8, [x5, #576] + ldr x8, [x5, #640] + ldr x8, [x5, #704] + ldr x8, [x5, #768] + ldr x8, [x5, #832] + ldr x8, [x5, #896] + ldr x8, [x5, #960] + ldr w10, [x5, x10, LSL 2] + ldr w13, [x5, x13, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ubfx x11, x7, #16, #8 + eor w10, w10, w13, ror 24 + ubfx x13, x6, #56, #8 + eor w10, w10, w14, ror 8 + ubfx x14, x7, #40, #8 + eor w10, w10, w15, ror 16 + ubfx x15, x6, #0, #8 + ldr w11, [x5, x11, LSL 2] + ldr w13, [x5, x13, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ubfx x12, x7, #48, #8 + eor w11, w11, w13, ror 24 + ubfx x13, x7, #24, #8 + eor w11, w11, w14, ror 8 + ubfx x14, x6, #8, #8 + eor w11, w11, w15, ror 16 + ubfx x15, x6, #32, #8 + bfi x10, x11, #32, #32 + ldr w12, [x5, x12, LSL 2] + ldr w13, [x5, x13, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ubfx x8, x7, #0, #8 + eor w12, w12, w13, ror 24 + ubfx x13, x6, #16, #8 + eor w12, w12, w14, ror 8 + ubfx x14, x7, #56, #8 + eor w11, w12, w15, ror 16 + ubfx x15, x6, #40, #8 + ldr w8, [x5, x8, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w13, [x5, x13, LSL 2] + ldr w15, [x5, x15, LSL 2] + eor w14, w14, w8, ror 24 + ldp x6, x7, [x17], #16 + eor w13, w13, w14, ror 24 + eor w13, w13, w15, ror 8 + bfi x11, x13, #32, #32 + # XOR in Key Schedule + eor x10, x10, x6 + eor x11, x11, x7 + ubfx x6, x11, #32, #8 + ubfx x9, x11, #8, #8 + ubfx x14, x10, #48, #8 + ubfx x15, x10, #24, #8 + lsl w6, w6, #2 + lsl w9, w9, #2 + lsl w14, w14, #2 + lsl w15, w15, #2 + ldr x13, [x5] + ldr x13, [x5, #64] + ldr x13, [x5, #128] + ldr x13, [x5, #192] + ldr x13, [x5, #256] + ldr x13, [x5, #320] + ldr x13, [x5, #384] + ldr x13, [x5, #448] + ldr x13, [x5, #512] + ldr x13, [x5, #576] + ldr x13, [x5, #640] + ldr x13, [x5, #704] + ldr x13, [x5, #768] + ldr x13, [x5, #832] + ldr x13, [x5, #896] + ldr x13, [x5, #960] + ldrb w6, [x5, x6, LSL 0] + ldrb w9, [x5, x9, LSL 0] + ldrb w14, [x5, x14, LSL 0] + ldrb 
w15, [x5, x15, LSL 0] + ubfx x7, x10, #0, #8 + eor w6, w6, w9, lsl 8 + ubfx x9, x11, #40, #8 + eor w6, w6, w14, lsl 16 + ubfx x14, x11, #16, #8 + eor w6, w6, w15, lsl 24 + ubfx x15, x10, #56, #8 + lsl w7, w7, #2 + lsl w9, w9, #2 + lsl w14, w14, #2 + lsl w15, w15, #2 + ldrb w7, [x5, x7, LSL 0] + ldrb w9, [x5, x9, LSL 0] + ldrb w14, [x5, x14, LSL 0] + ldrb w15, [x5, x15, LSL 0] + ubfx x8, x10, #32, #8 + eor w7, w7, w9, lsl 8 + ubfx x9, x10, #8, #8 + eor w7, w7, w14, lsl 16 + ubfx x14, x11, #48, #8 + eor w7, w7, w15, lsl 24 + ubfx x15, x11, #24, #8 + bfi x6, x7, #32, #32 + lsl w8, w8, #2 + lsl w9, w9, #2 + lsl w14, w14, #2 + lsl w15, w15, #2 + ldrb w8, [x5, x8, LSL 0] + ldrb w9, [x5, x9, LSL 0] + ldrb w14, [x5, x14, LSL 0] + ldrb w15, [x5, x15, LSL 0] + ubfx x13, x11, #56, #8 + eor w8, w8, w9, lsl 8 + ubfx x9, x11, #0, #8 + eor w8, w8, w14, lsl 16 + ubfx x14, x10, #40, #8 + eor w7, w8, w15, lsl 24 + ubfx x15, x10, #16, #8 + lsl w13, w13, #2 + lsl w9, w9, #2 + lsl w14, w14, #2 + lsl w15, w15, #2 + ldrb w13, [x5, x13, LSL 0] + ldrb w9, [x5, x9, LSL 0] + ldrb w14, [x5, x14, LSL 0] + ldrb w15, [x5, x15, LSL 0] + eor w14, w14, w13, lsl 16 + ldp x10, x11, [x17] /* last round key; no post-increment because x17 is reloaded from x3 for the next block */ + eor w9, w9, w14, lsl 8 + eor w9, w9, w15, lsl 16 + bfi x7, x9, #32, #32 + # XOR in Key Schedule + eor x6, x6, x10 + eor x7, x7, x11 + rev32 x6, x6 + rev32 x7, x7 + str x6, [x1] + str x7, [x1, #8] + subs x2, x2, #16 + add x0, x0, #16 + add x1, x1, #16 + bne L_AES_ECB_encrypt_loop_block_128 /* the flags come from the subs above (add does not set them). NOTE(review): a length of 0 on entry is not checked in this routine - confirm the caller guarantees a non-zero multiple of 16 */ + ldr x17, [x29, #24] + ldp x29, x30, [sp], #32 + ret +#ifndef __APPLE__ + .size AES_ECB_encrypt,.-AES_ECB_encrypt +#endif /* __APPLE__ */ +#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || + * WOLFSSL_AES_COUNTER || HAVE_AES_ECB */ +#ifdef HAVE_AES_CBC +#ifndef __APPLE__ +.text +.globl AES_CBC_encrypt +.type AES_CBC_encrypt,@function +.align 2 +AES_CBC_encrypt: +#else +.section __TEXT,__text +.globl _AES_CBC_encrypt +.p2align 2 +_AES_CBC_encrypt: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-32]! /* AES_CBC_encrypt: as used below, x0 = input, x1 = output, x2 = length in bytes (reduced by 16 per block), x3 = round-key schedule, w4 = round count, x5 = 16-byte chaining block that is read on entry and written back before ret. */
+ add x29, sp, #0 + stp x17, x19, [x29, #16] +#ifndef __APPLE__ + adrp x6, L_AES_ARM64_te + add x6, x6, :lo12:L_AES_ARM64_te +#else + adrp x6, L_AES_ARM64_te@PAGE + add x6, x6, :lo12:L_AES_ARM64_te@PAGEOFF +#endif /* __APPLE__ */ + ldp x7, x8, [x5] /* x7:x8 = chaining value; afterwards they carry each block's output into the next pass */ +L_AES_CBC_encrypt_loop_block: + mov x19, x3 /* restart at the first round key for every block */ + ldr x11, [x0] + ldr x12, [x0, #8] + eor x7, x7, x11 + eor x8, x8, x12 /* the input block is XORed into the chaining value before the rounds */ + rev32 x7, x7 + rev32 x8, x8 + ldp x11, x12, [x19], #16 + # Round: 0 - XOR in key schedule + eor x7, x7, x11 + eor x8, x8, x12 + sub w17, w4, #2 /* loop counter: each pass of loop_nr performs two rounds and subtracts 2 */ +L_AES_CBC_encrypt_loop_nr: + ubfx x11, x7, #48, #8 + ubfx x14, x7, #24, #8 + ubfx x15, x8, #8, #8 + ubfx x16, x8, #32, #8 + ldr x9, [x6] /* 16 discarded loads at a 64-byte stride across the table (x9 is overwritten before it is read); presumably a cache warm-up - TODO confirm */ + ldr x9, [x6, #64] + ldr x9, [x6, #128] + ldr x9, [x6, #192] + ldr x9, [x6, #256] + ldr x9, [x6, #320] + ldr x9, [x6, #384] + ldr x9, [x6, #448] + ldr x9, [x6, #512] + ldr x9, [x6, #576] + ldr x9, [x6, #640] + ldr x9, [x6, #704] + ldr x9, [x6, #768] + ldr x9, [x6, #832] + ldr x9, [x6, #896] + ldr x9, [x6, #960] + ldr w11, [x6, x11, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w16, [x6, x16, LSL 2] + ubfx x12, x8, #16, #8 + eor w11, w11, w14, ror 24 + ubfx x14, x7, #56, #8 + eor w11, w11, w15, ror 8 + ubfx x15, x8, #40, #8 + eor w11, w11, w16, ror 16 + ubfx x16, x7, #0, #8 + ldr w12, [x6, x12, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w16, [x6, x16, LSL 2] + ubfx x13, x8, #48, #8 + eor w12, w12, w14, ror 24 + ubfx x14, x8, #24, #8 + eor w12, w12, w15, ror 8 + ubfx x15, x7, #8, #8 + eor w12, w12, w16, ror 16 + ubfx x16, x7, #32, #8 + bfi x11, x12, #32, #32 + ldr w13, [x6, x13, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w16, [x6, x16, LSL 2] + ubfx x9, x8, #0, #8 + eor w13, w13, w14, ror 24 + ubfx x14, x7, #16, #8 + eor w13, w13, w15, ror 8 + ubfx x15, x8, #56, #8 + eor w12, w13, w16, ror 16 + ubfx x16, x7, #40, #8 + ldr w9, [x6, x9, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w16, [x6, x16, LSL 2] + eor w15, w15, w9, ror 24 +
ldp x7, x8, [x19], #16 + eor w14, w14, w15, ror 24 + eor w14, w14, w16, ror 8 + bfi x12, x14, #32, #32 + # XOR in Key Schedule + eor x11, x11, x7 + eor x12, x12, x8 + ubfx x7, x11, #48, #8 + ubfx x10, x11, #24, #8 + ubfx x15, x12, #8, #8 + ubfx x16, x12, #32, #8 + ldr x13, [x6] + ldr x13, [x6, #64] + ldr x13, [x6, #128] + ldr x13, [x6, #192] + ldr x13, [x6, #256] + ldr x13, [x6, #320] + ldr x13, [x6, #384] + ldr x13, [x6, #448] + ldr x13, [x6, #512] + ldr x13, [x6, #576] + ldr x13, [x6, #640] + ldr x13, [x6, #704] + ldr x13, [x6, #768] + ldr x13, [x6, #832] + ldr x13, [x6, #896] + ldr x13, [x6, #960] + ldr w7, [x6, x7, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w16, [x6, x16, LSL 2] + ubfx x8, x12, #16, #8 + eor w7, w7, w10, ror 24 + ubfx x10, x11, #56, #8 + eor w7, w7, w15, ror 8 + ubfx x15, x12, #40, #8 + eor w7, w7, w16, ror 16 + ubfx x16, x11, #0, #8 + ldr w8, [x6, x8, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w16, [x6, x16, LSL 2] + ubfx x9, x12, #48, #8 + eor w8, w8, w10, ror 24 + ubfx x10, x12, #24, #8 + eor w8, w8, w15, ror 8 + ubfx x15, x11, #8, #8 + eor w8, w8, w16, ror 16 + ubfx x16, x11, #32, #8 + bfi x7, x8, #32, #32 + ldr w9, [x6, x9, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w16, [x6, x16, LSL 2] + ubfx x13, x12, #0, #8 + eor w9, w9, w10, ror 24 + ubfx x10, x11, #16, #8 + eor w9, w9, w15, ror 8 + ubfx x15, x12, #56, #8 + eor w8, w9, w16, ror 16 + ubfx x16, x11, #40, #8 + ldr w13, [x6, x13, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w16, [x6, x16, LSL 2] + eor w15, w15, w13, ror 24 + ldp x11, x12, [x19], #16 + eor w10, w10, w15, ror 24 + eor w10, w10, w16, ror 8 + bfi x8, x10, #32, #32 + # XOR in Key Schedule + eor x7, x7, x11 + eor x8, x8, x12 + subs w17, w17, #2 /* two rounds were completed in this pass */ + bne L_AES_CBC_encrypt_loop_nr + ubfx x11, x7, #48, #8 + ubfx x14, x7, #24, #8 + ubfx x15, x8, #8, #8 + ubfx x16, x8, #32, #8 + ldr x9, [x6] + ldr x9, [x6, #64] + ldr x9, [x6, #128] +
ldr x9, [x6, #192] + ldr x9, [x6, #256] + ldr x9, [x6, #320] + ldr x9, [x6, #384] + ldr x9, [x6, #448] + ldr x9, [x6, #512] + ldr x9, [x6, #576] + ldr x9, [x6, #640] + ldr x9, [x6, #704] + ldr x9, [x6, #768] + ldr x9, [x6, #832] + ldr x9, [x6, #896] + ldr x9, [x6, #960] + ldr w11, [x6, x11, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w16, [x6, x16, LSL 2] + ubfx x12, x8, #16, #8 + eor w11, w11, w14, ror 24 + ubfx x14, x7, #56, #8 + eor w11, w11, w15, ror 8 + ubfx x15, x8, #40, #8 + eor w11, w11, w16, ror 16 + ubfx x16, x7, #0, #8 + ldr w12, [x6, x12, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w16, [x6, x16, LSL 2] + ubfx x13, x8, #48, #8 + eor w12, w12, w14, ror 24 + ubfx x14, x8, #24, #8 + eor w12, w12, w15, ror 8 + ubfx x15, x7, #8, #8 + eor w12, w12, w16, ror 16 + ubfx x16, x7, #32, #8 + bfi x11, x12, #32, #32 + ldr w13, [x6, x13, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w16, [x6, x16, LSL 2] + ubfx x9, x8, #0, #8 + eor w13, w13, w14, ror 24 + ubfx x14, x7, #16, #8 + eor w13, w13, w15, ror 8 + ubfx x15, x8, #56, #8 + eor w12, w13, w16, ror 16 + ubfx x16, x7, #40, #8 + ldr w9, [x6, x9, LSL 2] + ldr w15, [x6, x15, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w16, [x6, x16, LSL 2] + eor w15, w15, w9, ror 24 + ldp x7, x8, [x19], #16 + eor w14, w14, w15, ror 24 + eor w14, w14, w16, ror 8 + bfi x12, x14, #32, #32 + # XOR in Key Schedule + eor x11, x11, x7 + eor x12, x12, x8 + ubfx x7, x12, #32, #8 + ubfx x10, x12, #8, #8 + ubfx x15, x11, #48, #8 + ubfx x16, x11, #24, #8 + lsl w7, w7, #2 + lsl w10, w10, #2 + lsl w15, w15, #2 + lsl w16, w16, #2 /* last round: each index is scaled by 4 and a single byte is loaded (ldrb) from the same table */ + ldr x14, [x6] + ldr x14, [x6, #64] + ldr x14, [x6, #128] + ldr x14, [x6, #192] + ldr x14, [x6, #256] + ldr x14, [x6, #320] + ldr x14, [x6, #384] + ldr x14, [x6, #448] + ldr x14, [x6, #512] + ldr x14, [x6, #576] + ldr x14, [x6, #640] + ldr x14, [x6, #704] + ldr x14, [x6, #768] + ldr x14, [x6, #832] + ldr x14, [x6, #896] + ldr x14, [x6, #960] + ldrb w7,
[x6, x7, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w15, [x6, x15, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ubfx x8, x11, #0, #8 + eor w7, w7, w10, lsl 8 + ubfx x10, x12, #40, #8 + eor w7, w7, w15, lsl 16 + ubfx x15, x12, #16, #8 + eor w7, w7, w16, lsl 24 + ubfx x16, x11, #56, #8 + lsl w8, w8, #2 + lsl w10, w10, #2 + lsl w15, w15, #2 + lsl w16, w16, #2 + ldrb w8, [x6, x8, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w15, [x6, x15, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ubfx x9, x11, #32, #8 + eor w8, w8, w10, lsl 8 + ubfx x10, x11, #8, #8 + eor w8, w8, w15, lsl 16 + ubfx x15, x12, #48, #8 + eor w8, w8, w16, lsl 24 + ubfx x16, x12, #24, #8 + bfi x7, x8, #32, #32 + lsl w9, w9, #2 + lsl w10, w10, #2 + lsl w15, w15, #2 + lsl w16, w16, #2 + ldrb w9, [x6, x9, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w15, [x6, x15, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ubfx x14, x12, #56, #8 + eor w9, w9, w10, lsl 8 + ubfx x10, x12, #0, #8 + eor w9, w9, w15, lsl 16 + ubfx x15, x11, #40, #8 + eor w8, w9, w16, lsl 24 + ubfx x16, x11, #16, #8 + lsl w14, w14, #2 + lsl w10, w10, #2 + lsl w15, w15, #2 + lsl w16, w16, #2 + ldrb w14, [x6, x14, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w15, [x6, x15, LSL 0] + ldrb w16, [x6, x16, LSL 0] + eor w15, w15, w14, lsl 16 + ldp x11, x12, [x19] + eor w10, w10, w15, lsl 8 + eor w10, w10, w16, lsl 16 + bfi x8, x10, #32, #32 + # XOR in Key Schedule + eor x7, x7, x11 + eor x8, x8, x12 + rev32 x7, x7 + rev32 x8, x8 + str x7, [x1] + str x8, [x1, #8] + subs x2, x2, #16 + add x0, x0, #16 + add x1, x1, #16 + bne L_AES_CBC_encrypt_loop_block + stp x7, x8, [x5] /* the last output block is written back to the chaining block at x5 */ + ldp x17, x19, [x29, #16] + ldp x29, x30, [sp], #32 + ret +#ifndef __APPLE__ + .size AES_CBC_encrypt,.-AES_CBC_encrypt +#endif /* __APPLE__ */ +#endif /* HAVE_AES_CBC */ +#ifdef WOLFSSL_AES_COUNTER +#ifndef __APPLE__ +.text +.globl AES_CTR_encrypt +.type AES_CTR_encrypt,@function +.align 2 +AES_CTR_encrypt: +#else +.section __TEXT,__text +.globl _AES_CTR_encrypt +.p2align 2 +_AES_CTR_encrypt: +#endif /* __APPLE__
*/ + stp x29, x30, [sp, #-48]! + add x29, sp, #0 + stp x17, x19, [x29, #16] + stp x20, x21, [x29, #32] /* AES_CTR_encrypt: as used below, x0 = input, x1 = output, x2 = length in bytes (reduced by 16 per block), x3 = round-key schedule, w4 = round count, x5 = 16-byte counter block that is read on entry and written back (incremented once per block) before ret. */ +#ifndef __APPLE__ + adrp x6, L_AES_ARM64_te + add x6, x6, :lo12:L_AES_ARM64_te +#else + adrp x6, L_AES_ARM64_te@PAGE + add x6, x6, :lo12:L_AES_ARM64_te@PAGEOFF +#endif /* __APPLE__ */ + ldp x15, x16, [x5] + rev32 x15, x15 + rev32 x16, x16 /* x15:x16 hold the counter with each 32-bit word byte-swapped for the whole loop */ +L_AES_CTR_encrypt_loop_block_128: + mov x21, x3 /* restart at the first round key for every block */ + ldp x11, x12, [x21], #16 + # Round: 0 - XOR in key schedule + eor x7, x15, x11 + eor x8, x16, x12 /* the counter, not the input, is what goes through the rounds; x15:x16 stay intact */ + sub w20, w4, #2 /* loop counter: each pass of loop_nr performs two rounds and subtracts 2 */ +L_AES_CTR_encrypt_loop_nr: + ubfx x11, x7, #48, #8 + ubfx x14, x7, #24, #8 + ubfx x17, x8, #8, #8 + ubfx x19, x8, #32, #8 + ldr x9, [x6] /* 16 discarded loads at a 64-byte stride across the table (x9 is overwritten before it is read); presumably a cache warm-up - TODO confirm */ + ldr x9, [x6, #64] + ldr x9, [x6, #128] + ldr x9, [x6, #192] + ldr x9, [x6, #256] + ldr x9, [x6, #320] + ldr x9, [x6, #384] + ldr x9, [x6, #448] + ldr x9, [x6, #512] + ldr x9, [x6, #576] + ldr x9, [x6, #640] + ldr x9, [x6, #704] + ldr x9, [x6, #768] + ldr x9, [x6, #832] + ldr x9, [x6, #896] + ldr x9, [x6, #960] + ldr w11, [x6, x11, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w19, [x6, x19, LSL 2] + ubfx x12, x8, #16, #8 + eor w11, w11, w14, ror 24 + ubfx x14, x7, #56, #8 + eor w11, w11, w17, ror 8 + ubfx x17, x8, #40, #8 + eor w11, w11, w19, ror 16 + ubfx x19, x7, #0, #8 + ldr w12, [x6, x12, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w19, [x6, x19, LSL 2] + ubfx x13, x8, #48, #8 + eor w12, w12, w14, ror 24 + ubfx x14, x8, #24, #8 + eor w12, w12, w17, ror 8 + ubfx x17, x7, #8, #8 + eor w12, w12, w19, ror 16 + ubfx x19, x7, #32, #8 + bfi x11, x12, #32, #32 + ldr w13, [x6, x13, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w19, [x6, x19, LSL 2] + ubfx x9, x8, #0, #8 + eor w13, w13, w14, ror 24 + ubfx x14, x7, #16, #8 + eor w13, w13, w17, ror 8 + ubfx x17, x8, #56, #8 + eor w12, w13, w19, ror 16 + ubfx x19, x7, #40, #8 + ldr w9, [x6, x9, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w19, [x6, x19, LSL 2] + eor w17, w17, w9, ror 24 + ldp
x7, x8, [x21], #16 + eor w14, w14, w17, ror 24 + eor w14, w14, w19, ror 8 + bfi x12, x14, #32, #32 + # XOR in Key Schedule + eor x11, x11, x7 + eor x12, x12, x8 + ubfx x7, x11, #48, #8 + ubfx x10, x11, #24, #8 + ubfx x17, x12, #8, #8 + ubfx x19, x12, #32, #8 + ldr x13, [x6] + ldr x13, [x6, #64] + ldr x13, [x6, #128] + ldr x13, [x6, #192] + ldr x13, [x6, #256] + ldr x13, [x6, #320] + ldr x13, [x6, #384] + ldr x13, [x6, #448] + ldr x13, [x6, #512] + ldr x13, [x6, #576] + ldr x13, [x6, #640] + ldr x13, [x6, #704] + ldr x13, [x6, #768] + ldr x13, [x6, #832] + ldr x13, [x6, #896] + ldr x13, [x6, #960] + ldr w7, [x6, x7, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w19, [x6, x19, LSL 2] + ubfx x8, x12, #16, #8 + eor w7, w7, w10, ror 24 + ubfx x10, x11, #56, #8 + eor w7, w7, w17, ror 8 + ubfx x17, x12, #40, #8 + eor w7, w7, w19, ror 16 + ubfx x19, x11, #0, #8 + ldr w8, [x6, x8, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w19, [x6, x19, LSL 2] + ubfx x9, x12, #48, #8 + eor w8, w8, w10, ror 24 + ubfx x10, x12, #24, #8 + eor w8, w8, w17, ror 8 + ubfx x17, x11, #8, #8 + eor w8, w8, w19, ror 16 + ubfx x19, x11, #32, #8 + bfi x7, x8, #32, #32 + ldr w9, [x6, x9, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w19, [x6, x19, LSL 2] + ubfx x13, x12, #0, #8 + eor w9, w9, w10, ror 24 + ubfx x10, x11, #16, #8 + eor w9, w9, w17, ror 8 + ubfx x17, x12, #56, #8 + eor w8, w9, w19, ror 16 + ubfx x19, x11, #40, #8 + ldr w13, [x6, x13, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w10, [x6, x10, LSL 2] + ldr w19, [x6, x19, LSL 2] + eor w17, w17, w13, ror 24 + ldp x11, x12, [x21], #16 + eor w10, w10, w17, ror 24 + eor w10, w10, w19, ror 8 + bfi x8, x10, #32, #32 + # XOR in Key Schedule + eor x7, x7, x11 + eor x8, x8, x12 + subs w20, w20, #2 /* two rounds were completed in this pass */ + bne L_AES_CTR_encrypt_loop_nr + ubfx x11, x7, #48, #8 + ubfx x14, x7, #24, #8 + ubfx x17, x8, #8, #8 + ubfx x19, x8, #32, #8 + ldr x9, [x6] + ldr x9, [x6, #64] + ldr x9, [x6, #128] + ldr
x9, [x6, #192] + ldr x9, [x6, #256] + ldr x9, [x6, #320] + ldr x9, [x6, #384] + ldr x9, [x6, #448] + ldr x9, [x6, #512] + ldr x9, [x6, #576] + ldr x9, [x6, #640] + ldr x9, [x6, #704] + ldr x9, [x6, #768] + ldr x9, [x6, #832] + ldr x9, [x6, #896] + ldr x9, [x6, #960] + ldr w11, [x6, x11, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w19, [x6, x19, LSL 2] + ubfx x12, x8, #16, #8 + eor w11, w11, w14, ror 24 + ubfx x14, x7, #56, #8 + eor w11, w11, w17, ror 8 + ubfx x17, x8, #40, #8 + eor w11, w11, w19, ror 16 + ubfx x19, x7, #0, #8 + ldr w12, [x6, x12, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w19, [x6, x19, LSL 2] + ubfx x13, x8, #48, #8 + eor w12, w12, w14, ror 24 + ubfx x14, x8, #24, #8 + eor w12, w12, w17, ror 8 + ubfx x17, x7, #8, #8 + eor w12, w12, w19, ror 16 + ubfx x19, x7, #32, #8 + bfi x11, x12, #32, #32 + ldr w13, [x6, x13, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w19, [x6, x19, LSL 2] + ubfx x9, x8, #0, #8 + eor w13, w13, w14, ror 24 + ubfx x14, x7, #16, #8 + eor w13, w13, w17, ror 8 + ubfx x17, x8, #56, #8 + eor w12, w13, w19, ror 16 + ubfx x19, x7, #40, #8 + ldr w9, [x6, x9, LSL 2] + ldr w17, [x6, x17, LSL 2] + ldr w14, [x6, x14, LSL 2] + ldr w19, [x6, x19, LSL 2] + eor w17, w17, w9, ror 24 + ldp x7, x8, [x21], #16 + eor w14, w14, w17, ror 24 + eor w14, w14, w19, ror 8 + bfi x12, x14, #32, #32 + # XOR in Key Schedule + eor x11, x11, x7 + eor x12, x12, x8 + ubfx x7, x12, #32, #8 + ubfx x10, x12, #8, #8 + ubfx x17, x11, #48, #8 + ubfx x19, x11, #24, #8 + lsl w7, w7, #2 + lsl w10, w10, #2 + lsl w17, w17, #2 + lsl w19, w19, #2 /* last round: each index is scaled by 4 and a single byte is loaded (ldrb) from the same table */ + ldr x14, [x6] + ldr x14, [x6, #64] + ldr x14, [x6, #128] + ldr x14, [x6, #192] + ldr x14, [x6, #256] + ldr x14, [x6, #320] + ldr x14, [x6, #384] + ldr x14, [x6, #448] + ldr x14, [x6, #512] + ldr x14, [x6, #576] + ldr x14, [x6, #640] + ldr x14, [x6, #704] + ldr x14, [x6, #768] + ldr x14, [x6, #832] + ldr x14, [x6, #896] + ldr x14, [x6, #960] + ldrb w7, [x6,
x7, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w17, [x6, x17, LSL 0] + ldrb w19, [x6, x19, LSL 0] + ubfx x8, x11, #0, #8 + eor w7, w7, w10, lsl 8 + ubfx x10, x12, #40, #8 + eor w7, w7, w17, lsl 16 + ubfx x17, x12, #16, #8 + eor w7, w7, w19, lsl 24 + ubfx x19, x11, #56, #8 + lsl w8, w8, #2 + lsl w10, w10, #2 + lsl w17, w17, #2 + lsl w19, w19, #2 + ldrb w8, [x6, x8, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w17, [x6, x17, LSL 0] + ldrb w19, [x6, x19, LSL 0] + ubfx x9, x11, #32, #8 + eor w8, w8, w10, lsl 8 + ubfx x10, x11, #8, #8 + eor w8, w8, w17, lsl 16 + ubfx x17, x12, #48, #8 + eor w8, w8, w19, lsl 24 + ubfx x19, x12, #24, #8 + bfi x7, x8, #32, #32 + lsl w9, w9, #2 + lsl w10, w10, #2 + lsl w17, w17, #2 + lsl w19, w19, #2 + ldrb w9, [x6, x9, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w17, [x6, x17, LSL 0] + ldrb w19, [x6, x19, LSL 0] + ubfx x14, x12, #56, #8 + eor w9, w9, w10, lsl 8 + ubfx x10, x12, #0, #8 + eor w9, w9, w17, lsl 16 + ubfx x17, x11, #40, #8 + eor w8, w9, w19, lsl 24 + ubfx x19, x11, #16, #8 + lsl w14, w14, #2 + lsl w10, w10, #2 + lsl w17, w17, #2 + lsl w19, w19, #2 + ldrb w14, [x6, x14, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w17, [x6, x17, LSL 0] + ldrb w19, [x6, x19, LSL 0] + eor w17, w17, w14, lsl 16 + ldp x11, x12, [x21] + eor w10, w10, w17, lsl 8 + eor w10, w10, w19, lsl 16 + bfi x8, x10, #32, #32 + # XOR in Key Schedule + eor x7, x7, x11 + eor x8, x8, x12 + rev32 x7, x7 + rev32 x8, x8 + ldr x11, [x0] + ldr x12, [x0, #8] + eor x7, x7, x11 + eor x8, x8, x12 /* the encrypted counter is XORed with the input block to form the output */ + str x7, [x1] + str x8, [x1, #8] + ror x16, x16, #32 + ror x15, x15, #32 /* swap the 32-bit halves so each register can be treated as one 64-bit number for the add below */ + adds x16, x16, #1 + adc x15, x15, xzr /* 128-bit increment: x16 is the low half, the carry propagates into x15 */ + ror x16, x16, #32 + ror x15, x15, #32 + subs x2, x2, #16 + add x0, x0, #16 + add x1, x1, #16 + bne L_AES_CTR_encrypt_loop_block_128 + rev32 x15, x15 + rev32 x16, x16 + stp x15, x16, [x5] /* the updated counter is stored back to x5 */ + ldp x17, x19, [x29, #16] + ldp x20, x21, [x29, #32] + ldp x29, x30, [sp], #48 + ret +#ifndef __APPLE__ + .size AES_CTR_encrypt,.-AES_CTR_encrypt +#endif /* __APPLE__ */ +#endif /*
WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_CBC) || defined(HAVE_AES_ECB) +#ifndef __APPLE__ + .text + .type L_AES_ARM64_td4, %object + .section .rodata + .size L_AES_ARM64_td4, 256 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 1 +#else + .p2align 1 +#endif /* __APPLE__ */ +L_AES_ARM64_td4: /* 256 single-byte entries (32 rows of 8); the values match the AES inverse S-box in FIPS 197. AES_ECB_decrypt indexes this table with ldrb in its last round. */ + .byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38 + .byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb + .byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87 + .byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb + .byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d + .byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e + .byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2 + .byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25 + .byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16 + .byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92 + .byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda + .byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84 + .byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a + .byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06 + .byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02 + .byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b + .byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea + .byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73 + .byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85 + .byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e + .byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89 + .byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b + .byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20 + .byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4 + .byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31 + .byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f + .byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d + .byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef + .byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0 + .byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61 + .byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26 + .byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d +#if defined(WOLFSSL_AES_DIRECT) ||
defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_ECB) +#ifndef __APPLE__ +.text +.globl AES_ECB_decrypt +.type AES_ECB_decrypt,@function +.align 2 +AES_ECB_decrypt: +#else +.section __TEXT,__text +.globl _AES_ECB_decrypt +.p2align 2 +_AES_ECB_decrypt: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-32]! + add x29, sp, #0 + stp x17, x19, [x29, #16] /* AES_ECB_decrypt: as used below, x0 = input, x1 = output, x2 = length in bytes (reduced by 16 per block), x3 = round-key schedule, w4 = round count. x5 is set to L_AES_ARM64_td (32-bit lookups in every round but the last) and x6 to L_AES_ARM64_td4 (byte lookups in the last round). */ +#ifndef __APPLE__ + adrp x5, L_AES_ARM64_td + add x5, x5, :lo12:L_AES_ARM64_td +#else + adrp x5, L_AES_ARM64_td@PAGE + add x5, x5, :lo12:L_AES_ARM64_td@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x6, L_AES_ARM64_td4 + add x6, x6, :lo12:L_AES_ARM64_td4 +#else + adrp x6, L_AES_ARM64_td4@PAGE + add x6, x6, :lo12:L_AES_ARM64_td4@PAGEOFF +#endif /* __APPLE__ */ +L_AES_ECB_decrypt_loop_block: + mov x19, x3 /* restart at the first round key for every block */ + ldr x7, [x0] + ldr x8, [x0, #8] + rev32 x7, x7 + rev32 x8, x8 + ldp x11, x12, [x19], #16 + # Round: 0 - XOR in key schedule + eor x7, x7, x11 + eor x8, x8, x12 + sub w17, w4, #2 /* loop counter: each pass of loop_nr performs two rounds and subtracts 2 */ +L_AES_ECB_decrypt_loop_nr: + ubfx x11, x8, #48, #8 + ubfx x14, x7, #24, #8 + ubfx x15, x8, #8, #8 + ubfx x16, x7, #32, #8 + ldr x9, [x5] /* 16 discarded loads at a 64-byte stride across the table at x5 (x9 is overwritten before it is read); presumably a cache warm-up - TODO confirm */ + ldr x9, [x5, #64] + ldr x9, [x5, #128] + ldr x9, [x5, #192] + ldr x9, [x5, #256] + ldr x9, [x5, #320] + ldr x9, [x5, #384] + ldr x9, [x5, #448] + ldr x9, [x5, #512] + ldr x9, [x5, #576] + ldr x9, [x5, #640] + ldr x9, [x5, #704] + ldr x9, [x5, #768] + ldr x9, [x5, #832] + ldr x9, [x5, #896] + ldr x9, [x5, #960] + ldr w11, [x5, x11, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w16, [x5, x16, LSL 2] + ubfx x12, x7, #16, #8 + eor w11, w11, w14, ror 24 + ubfx x14, x7, #56, #8 + eor w11, w11, w15, ror 8 + ubfx x15, x8, #40, #8 + eor w11, w11, w16, ror 16 + ubfx x16, x8, #0, #8 + ldr w12, [x5, x12, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w16, [x5, x16, LSL 2] + ubfx x13, x7, #48, #8 + eor w12, w12, w14, ror 24 + ubfx x14, x8, #24, #8 + eor w12, w12, w15, ror 8 + ubfx x15, x7, #8, #8 + eor w12, w12, w16, ror 16 + ubfx x16, x8, #32, #8 + bfi x11, x12,
#32, #32 + ldr w13, [x5, x13, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w16, [x5, x16, LSL 2] + ubfx x9, x7, #0, #8 + eor w13, w13, w14, ror 24 + ubfx x14, x8, #16, #8 + eor w13, w13, w15, ror 8 + ubfx x15, x8, #56, #8 + eor w12, w13, w16, ror 16 + ubfx x16, x7, #40, #8 + ldr w9, [x5, x9, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w16, [x5, x16, LSL 2] + eor w15, w15, w9, ror 24 + ldp x7, x8, [x19], #16 + eor w14, w14, w16, ror 8 + eor w14, w14, w15, ror 24 + bfi x12, x14, #32, #32 + # XOR in Key Schedule + eor x11, x11, x7 + eor x12, x12, x8 + ubfx x7, x12, #48, #8 + ubfx x10, x11, #24, #8 + ubfx x15, x12, #8, #8 + ubfx x16, x11, #32, #8 + ldr x13, [x5] + ldr x13, [x5, #64] + ldr x13, [x5, #128] + ldr x13, [x5, #192] + ldr x13, [x5, #256] + ldr x13, [x5, #320] + ldr x13, [x5, #384] + ldr x13, [x5, #448] + ldr x13, [x5, #512] + ldr x13, [x5, #576] + ldr x13, [x5, #640] + ldr x13, [x5, #704] + ldr x13, [x5, #768] + ldr x13, [x5, #832] + ldr x13, [x5, #896] + ldr x13, [x5, #960] + ldr w7, [x5, x7, LSL 2] + ldr w10, [x5, x10, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w16, [x5, x16, LSL 2] + ubfx x8, x11, #16, #8 + eor w7, w7, w10, ror 24 + ubfx x10, x11, #56, #8 + eor w7, w7, w15, ror 8 + ubfx x15, x12, #40, #8 + eor w7, w7, w16, ror 16 + ubfx x16, x12, #0, #8 + ldr w8, [x5, x8, LSL 2] + ldr w10, [x5, x10, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w16, [x5, x16, LSL 2] + ubfx x9, x11, #48, #8 + eor w8, w8, w10, ror 24 + ubfx x10, x12, #24, #8 + eor w8, w8, w15, ror 8 + ubfx x15, x11, #8, #8 + eor w8, w8, w16, ror 16 + ubfx x16, x12, #32, #8 + bfi x7, x8, #32, #32 + ldr w9, [x5, x9, LSL 2] + ldr w10, [x5, x10, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w16, [x5, x16, LSL 2] + ubfx x13, x11, #0, #8 + eor w9, w9, w10, ror 24 + ubfx x10, x12, #16, #8 + eor w9, w9, w15, ror 8 + ubfx x15, x12, #56, #8 + eor w8, w9, w16, ror 16 + ubfx x16, x11, #40, #8 + ldr w13, [x5, x13, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w10, [x5,
x10, LSL 2] + ldr w16, [x5, x16, LSL 2] + eor w15, w15, w13, ror 24 + ldp x11, x12, [x19], #16 + eor w10, w10, w16, ror 8 + eor w10, w10, w15, ror 24 + bfi x8, x10, #32, #32 + # XOR in Key Schedule + eor x7, x7, x11 + eor x8, x8, x12 + subs w17, w17, #2 /* two rounds were completed in this pass */ + bne L_AES_ECB_decrypt_loop_nr + ubfx x11, x8, #48, #8 + ubfx x14, x7, #24, #8 + ubfx x15, x8, #8, #8 + ubfx x16, x7, #32, #8 + ldr x9, [x5] + ldr x9, [x5, #64] + ldr x9, [x5, #128] + ldr x9, [x5, #192] + ldr x9, [x5, #256] + ldr x9, [x5, #320] + ldr x9, [x5, #384] + ldr x9, [x5, #448] + ldr x9, [x5, #512] + ldr x9, [x5, #576] + ldr x9, [x5, #640] + ldr x9, [x5, #704] + ldr x9, [x5, #768] + ldr x9, [x5, #832] + ldr x9, [x5, #896] + ldr x9, [x5, #960] + ldr w11, [x5, x11, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w16, [x5, x16, LSL 2] + ubfx x12, x7, #16, #8 + eor w11, w11, w14, ror 24 + ubfx x14, x7, #56, #8 + eor w11, w11, w15, ror 8 + ubfx x15, x8, #40, #8 + eor w11, w11, w16, ror 16 + ubfx x16, x8, #0, #8 + ldr w12, [x5, x12, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w16, [x5, x16, LSL 2] + ubfx x13, x7, #48, #8 + eor w12, w12, w14, ror 24 + ubfx x14, x8, #24, #8 + eor w12, w12, w15, ror 8 + ubfx x15, x7, #8, #8 + eor w12, w12, w16, ror 16 + ubfx x16, x8, #32, #8 + bfi x11, x12, #32, #32 + ldr w13, [x5, x13, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w16, [x5, x16, LSL 2] + ubfx x9, x7, #0, #8 + eor w13, w13, w14, ror 24 + ubfx x14, x8, #16, #8 + eor w13, w13, w15, ror 8 + ubfx x15, x8, #56, #8 + eor w12, w13, w16, ror 16 + ubfx x16, x7, #40, #8 + ldr w9, [x5, x9, LSL 2] + ldr w15, [x5, x15, LSL 2] + ldr w14, [x5, x14, LSL 2] + ldr w16, [x5, x16, LSL 2] + eor w15, w15, w9, ror 24 + ldp x7, x8, [x19], #16 + eor w14, w14, w16, ror 8 + eor w14, w14, w15, ror 24 + bfi x12, x14, #32, #32 + # XOR in Key Schedule + eor x11, x11, x7 + eor x12, x12, x8 + ubfx x7, x11, #32, #8 + ubfx x10, x12, #8, #8 + ubfx x15, x12, #48, #8 + ubfx x16, x11,
#24, #8 + ldr x14, [x6] /* NOTE(review): this and the next 15 discarded loads reach offset 960 from x6, but L_AES_ARM64_td4 is declared with .size 256, so the loads from #256 upward read past the end of that table - confirm this is intended */ + ldr x14, [x6, #64] + ldr x14, [x6, #128] + ldr x14, [x6, #192] + ldr x14, [x6, #256] + ldr x14, [x6, #320] + ldr x14, [x6, #384] + ldr x14, [x6, #448] + ldr x14, [x6, #512] + ldr x14, [x6, #576] + ldr x14, [x6, #640] + ldr x14, [x6, #704] + ldr x14, [x6, #768] + ldr x14, [x6, #832] + ldr x14, [x6, #896] + ldr x14, [x6, #960] + ldrb w7, [x6, x7, LSL 0] /* last round: single-byte lookups in the table at x6, indexed without scaling */ + ldrb w10, [x6, x10, LSL 0] + ldrb w15, [x6, x15, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ubfx x8, x12, #0, #8 + eor w7, w7, w10, lsl 8 + ubfx x10, x12, #40, #8 + eor w7, w7, w15, lsl 16 + ubfx x15, x11, #16, #8 + eor w7, w7, w16, lsl 24 + ubfx x16, x11, #56, #8 + ldrb w10, [x6, x10, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ldrb w8, [x6, x8, LSL 0] + ldrb w15, [x6, x15, LSL 0] + ubfx x9, x12, #32, #8 + eor w8, w8, w10, lsl 8 + ubfx x10, x11, #8, #8 + eor w8, w8, w15, lsl 16 + ubfx x15, x11, #48, #8 + eor w8, w8, w16, lsl 24 + ubfx x16, x12, #24, #8 + bfi x7, x8, #32, #32 + ldrb w10, [x6, x10, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ldrb w9, [x6, x9, LSL 0] + ldrb w15, [x6, x15, LSL 0] + ubfx x14, x12, #56, #8 + eor w9, w9, w10, lsl 8 + ubfx x10, x11, #0, #8 + eor w9, w9, w15, lsl 16 + ubfx x15, x11, #40, #8 + eor w8, w9, w16, lsl 24 + ubfx x16, x12, #16, #8 + ldrb w14, [x6, x14, LSL 0] + ldrb w15, [x6, x15, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w16, [x6, x16, LSL 0] + eor w15, w15, w14, lsl 16 + ldp x11, x12, [x19] + eor w10, w10, w15, lsl 8 + eor w10, w10, w16, lsl 16 + bfi x8, x10, #32, #32 + # XOR in Key Schedule + eor x7, x7, x11 + eor x8, x8, x12 + rev32 x7, x7 + rev32 x8, x8 + str x7, [x1] + str x8, [x1, #8] + subs x2, x2, #16 + add x0, x0, #16 + add x1, x1, #16 + bne L_AES_ECB_decrypt_loop_block /* the flags come from the subs above; a length of 0 on entry is not checked in this routine */ + ldp x17, x19, [x29, #16] + ldp x29, x30, [sp], #32 + ret +#ifndef __APPLE__ + .size AES_ECB_decrypt,.-AES_ECB_decrypt +#endif /* __APPLE__ */ +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || HAVE_AES_ECB */ +#ifdef HAVE_AES_CBC +#ifndef __APPLE__ +.text +.globl AES_CBC_decrypt
+.type AES_CBC_decrypt,@function +.align 2 +AES_CBC_decrypt: +#else +.section __TEXT,__text +.globl _AES_CBC_decrypt +.p2align 2 +_AES_CBC_decrypt: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-48]! /* AES_CBC_decrypt, as used by the code below: x0 = input (16 bytes read per block), x1 = output, x2 = byte count (reduced by 16 per block), x3 = key schedule (16 bytes read per round), w4 = round count, x5 = 32-byte chaining buffer. The even block XORs its result with [x5] and parks its own ciphertext at [x5,#16]; the odd block does the reverse. On return [x5] holds the last ciphertext block that was read. NOTE(review): the block loop is bottom-tested, so x2 is presumably a non-zero multiple of 16 - confirm with callers. */ + add x29, sp, #0 + stp x17, x19, [x29, #24] + str x20, [x29, #40] /* x17, x19 and x20 are reloaded from these slots before ret */ +#ifndef __APPLE__ + adrp x6, L_AES_ARM64_td4 + add x6, x6, :lo12:L_AES_ARM64_td4 /* x6 = table read one byte at a time (ldrb) in the last round */ +#else + adrp x6, L_AES_ARM64_td4@PAGE + add x6, x6, :lo12:L_AES_ARM64_td4@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x7, L_AES_ARM64_td + add x7, x7, :lo12:L_AES_ARM64_td /* x7 = table read as 32-bit words (index scaled by 4) in all other rounds */ +#else + adrp x7, L_AES_ARM64_td@PAGE + add x7, x7, :lo12:L_AES_ARM64_td@PAGEOFF +#endif /* __APPLE__ */ +L_AES_CBC_decrypt_loop_block: + mov x20, x3 + ldr x8, [x0] + ldr x9, [x0, #8] + stnp x8, x9, [x5, #16] /* keep this block's ciphertext for chaining into the following block */ + rev32 x8, x8 + rev32 x9, x9 /* byte-swap each 32-bit word of the block */ + ldp x12, x13, [x20], #16 + # Round: 0 - XOR in key schedule + eor x8, x8, x12 + eor x9, x9, x13 + sub w19, w4, #2 /* loop counter: the loop below performs two rounds per pass, two more rounds follow it */ +L_AES_CBC_decrypt_loop_nr_even: + ubfx x12, x9, #48, #8 + ubfx x15, x8, #24, #8 + ubfx x16, x9, #8, #8 + ubfx x17, x8, #32, #8 + ldr x10, [x7] /* this and the next 15 loads never use the value read (x10 is overwritten before any use); they touch the table every 64 bytes across 1 KiB - presumably to pull it into cache ahead of the data-dependent lookups, TODO confirm */ + ldr x10, [x7, #64] + ldr x10, [x7, #128] + ldr x10, [x7, #192] + ldr x10, [x7, #256] + ldr x10, [x7, #320] + ldr x10, [x7, #384] + ldr x10, [x7, #448] + ldr x10, [x7, #512] + ldr x10, [x7, #576] + ldr x10, [x7, #640] + ldr x10, [x7, #704] + ldr x10, [x7, #768] + ldr x10, [x7, #832] + ldr x10, [x7, #896] + ldr x10, [x7, #960] + ldr w12, [x7, x12, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x13, x8, #16, #8 + eor w12, w12, w15, ror 24 + ubfx x15, x8, #56, #8 + eor w12, w12, w16, ror 8 + ubfx x16, x9, #40, #8 + eor w12, w12, w17, ror 16 + ubfx x17, x9, #0, #8 + ldr w13, [x7, x13, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x14, x8, #48, #8 + eor w13, w13, w15, ror 24 + ubfx x15, x9, #24, #8 + eor w13, w13, w16, ror 8 + ubfx x16, x8, #8, #8 + eor w13, w13, w17, ror 16 + ubfx x17, x9, #32, #8 + bfi x12, x13, #32, #32 + ldr w14, [x7, x14, LSL 2] +
ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x10, x8, #0, #8 + eor w14, w14, w15, ror 24 + ubfx x15, x9, #16, #8 + eor w14, w14, w16, ror 8 + ubfx x16, x9, #56, #8 + eor w13, w14, w17, ror 16 + ubfx x17, x8, #40, #8 + ldr w10, [x7, x10, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w17, [x7, x17, LSL 2] + eor w16, w16, w10, ror 24 + ldp x8, x9, [x20], #16 + eor w15, w15, w17, ror 8 + eor w15, w15, w16, ror 24 + bfi x13, x15, #32, #32 + # XOR in Key Schedule + eor x12, x12, x8 + eor x13, x13, x9 + ubfx x8, x13, #48, #8 + ubfx x11, x12, #24, #8 + ubfx x16, x13, #8, #8 + ubfx x17, x12, #32, #8 + ldr x14, [x7] + ldr x14, [x7, #64] + ldr x14, [x7, #128] + ldr x14, [x7, #192] + ldr x14, [x7, #256] + ldr x14, [x7, #320] + ldr x14, [x7, #384] + ldr x14, [x7, #448] + ldr x14, [x7, #512] + ldr x14, [x7, #576] + ldr x14, [x7, #640] + ldr x14, [x7, #704] + ldr x14, [x7, #768] + ldr x14, [x7, #832] + ldr x14, [x7, #896] + ldr x14, [x7, #960] + ldr w8, [x7, x8, LSL 2] + ldr w11, [x7, x11, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x9, x12, #16, #8 + eor w8, w8, w11, ror 24 + ubfx x11, x12, #56, #8 + eor w8, w8, w16, ror 8 + ubfx x16, x13, #40, #8 + eor w8, w8, w17, ror 16 + ubfx x17, x13, #0, #8 + ldr w9, [x7, x9, LSL 2] + ldr w11, [x7, x11, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x10, x12, #48, #8 + eor w9, w9, w11, ror 24 + ubfx x11, x13, #24, #8 + eor w9, w9, w16, ror 8 + ubfx x16, x12, #8, #8 + eor w9, w9, w17, ror 16 + ubfx x17, x13, #32, #8 + bfi x8, x9, #32, #32 + ldr w10, [x7, x10, LSL 2] + ldr w11, [x7, x11, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x14, x12, #0, #8 + eor w10, w10, w11, ror 24 + ubfx x11, x13, #16, #8 + eor w10, w10, w16, ror 8 + ubfx x16, x13, #56, #8 + eor w9, w10, w17, ror 16 + ubfx x17, x12, #40, #8 + ldr w14, [x7, x14, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w11, [x7, x11, LSL 2] + ldr w17, [x7, 
x17, LSL 2] + eor w16, w16, w14, ror 24 + ldp x12, x13, [x20], #16 + eor w11, w11, w17, ror 8 + eor w11, w11, w16, ror 24 + bfi x9, x11, #32, #32 + # XOR in Key Schedule + eor x8, x8, x12 + eor x9, x9, x13 + subs w19, w19, #2 + bne L_AES_CBC_decrypt_loop_nr_even + ubfx x12, x9, #48, #8 + ubfx x15, x8, #24, #8 + ubfx x16, x9, #8, #8 + ubfx x17, x8, #32, #8 + ldr x10, [x7] + ldr x10, [x7, #64] + ldr x10, [x7, #128] + ldr x10, [x7, #192] + ldr x10, [x7, #256] + ldr x10, [x7, #320] + ldr x10, [x7, #384] + ldr x10, [x7, #448] + ldr x10, [x7, #512] + ldr x10, [x7, #576] + ldr x10, [x7, #640] + ldr x10, [x7, #704] + ldr x10, [x7, #768] + ldr x10, [x7, #832] + ldr x10, [x7, #896] + ldr x10, [x7, #960] + ldr w12, [x7, x12, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x13, x8, #16, #8 + eor w12, w12, w15, ror 24 + ubfx x15, x8, #56, #8 + eor w12, w12, w16, ror 8 + ubfx x16, x9, #40, #8 + eor w12, w12, w17, ror 16 + ubfx x17, x9, #0, #8 + ldr w13, [x7, x13, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x14, x8, #48, #8 + eor w13, w13, w15, ror 24 + ubfx x15, x9, #24, #8 + eor w13, w13, w16, ror 8 + ubfx x16, x8, #8, #8 + eor w13, w13, w17, ror 16 + ubfx x17, x9, #32, #8 + bfi x12, x13, #32, #32 + ldr w14, [x7, x14, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x10, x8, #0, #8 + eor w14, w14, w15, ror 24 + ubfx x15, x9, #16, #8 + eor w14, w14, w16, ror 8 + ubfx x16, x9, #56, #8 + eor w13, w14, w17, ror 16 + ubfx x17, x8, #40, #8 + ldr w10, [x7, x10, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w17, [x7, x17, LSL 2] + eor w16, w16, w10, ror 24 + ldp x8, x9, [x20], #16 + eor w15, w15, w17, ror 8 + eor w15, w15, w16, ror 24 + bfi x13, x15, #32, #32 + # XOR in Key Schedule + eor x12, x12, x8 + eor x13, x13, x9 + ubfx x8, x12, #32, #8 + ubfx x11, x13, #8, #8 + ubfx x16, x13, #48, #8 + ubfx x17, x12, #24, 
#8 + ldr x15, [x6] + ldr x15, [x6, #64] + ldr x15, [x6, #128] + ldr x15, [x6, #192] + ldr x15, [x6, #256] + ldr x15, [x6, #320] + ldr x15, [x6, #384] + ldr x15, [x6, #448] + ldr x15, [x6, #512] + ldr x15, [x6, #576] + ldr x15, [x6, #640] + ldr x15, [x6, #704] + ldr x15, [x6, #768] + ldr x15, [x6, #832] + ldr x15, [x6, #896] + ldr x15, [x6, #960] + ldrb w8, [x6, x8, LSL 0] + ldrb w11, [x6, x11, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ldrb w17, [x6, x17, LSL 0] + ubfx x9, x13, #0, #8 + eor w8, w8, w11, lsl 8 + ubfx x11, x13, #40, #8 + eor w8, w8, w16, lsl 16 + ubfx x16, x12, #16, #8 + eor w8, w8, w17, lsl 24 + ubfx x17, x12, #56, #8 + ldrb w11, [x6, x11, LSL 0] + ldrb w17, [x6, x17, LSL 0] + ldrb w9, [x6, x9, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ubfx x10, x13, #32, #8 + eor w9, w9, w11, lsl 8 + ubfx x11, x12, #8, #8 + eor w9, w9, w16, lsl 16 + ubfx x16, x12, #48, #8 + eor w9, w9, w17, lsl 24 + ubfx x17, x13, #24, #8 + bfi x8, x9, #32, #32 + ldrb w11, [x6, x11, LSL 0] + ldrb w17, [x6, x17, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ubfx x15, x13, #56, #8 + eor w10, w10, w11, lsl 8 + ubfx x11, x12, #0, #8 + eor w10, w10, w16, lsl 16 + ubfx x16, x12, #40, #8 + eor w9, w10, w17, lsl 24 + ubfx x17, x13, #16, #8 + ldrb w15, [x6, x15, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ldrb w11, [x6, x11, LSL 0] + ldrb w17, [x6, x17, LSL 0] + eor w16, w16, w15, lsl 16 + ldp x12, x13, [x20] + eor w11, w11, w16, lsl 8 + eor w11, w11, w17, lsl 16 + bfi x9, x11, #32, #32 + # XOR in Key Schedule + eor x8, x8, x12 + eor x9, x9, x13 + rev32 x8, x8 + rev32 x9, x9 + ldp x12, x13, [x5] + eor x8, x8, x12 + eor x9, x9, x13 + str x8, [x1] + str x9, [x1, #8] + subs x2, x2, #16 + add x0, x0, #16 + add x1, x1, #16 + beq L_AES_CBC_decrypt_end_dec_odd + mov x20, x3 + ldr x8, [x0] + ldr x9, [x0, #8] + stp x8, x9, [x5] + rev32 x8, x8 + rev32 x9, x9 + ldp x12, x13, [x20], #16 + # Round: 0 - XOR in key schedule + eor x8, x8, x12 + eor x9, x9, x13 + sub w19, w4, #2 
+L_AES_CBC_decrypt_loop_nr_odd: + ubfx x12, x9, #48, #8 + ubfx x15, x8, #24, #8 + ubfx x16, x9, #8, #8 + ubfx x17, x8, #32, #8 + ldr x10, [x7] + ldr x10, [x7, #64] + ldr x10, [x7, #128] + ldr x10, [x7, #192] + ldr x10, [x7, #256] + ldr x10, [x7, #320] + ldr x10, [x7, #384] + ldr x10, [x7, #448] + ldr x10, [x7, #512] + ldr x10, [x7, #576] + ldr x10, [x7, #640] + ldr x10, [x7, #704] + ldr x10, [x7, #768] + ldr x10, [x7, #832] + ldr x10, [x7, #896] + ldr x10, [x7, #960] + ldr w12, [x7, x12, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x13, x8, #16, #8 + eor w12, w12, w15, ror 24 + ubfx x15, x8, #56, #8 + eor w12, w12, w16, ror 8 + ubfx x16, x9, #40, #8 + eor w12, w12, w17, ror 16 + ubfx x17, x9, #0, #8 + ldr w13, [x7, x13, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x14, x8, #48, #8 + eor w13, w13, w15, ror 24 + ubfx x15, x9, #24, #8 + eor w13, w13, w16, ror 8 + ubfx x16, x8, #8, #8 + eor w13, w13, w17, ror 16 + ubfx x17, x9, #32, #8 + bfi x12, x13, #32, #32 + ldr w14, [x7, x14, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x10, x8, #0, #8 + eor w14, w14, w15, ror 24 + ubfx x15, x9, #16, #8 + eor w14, w14, w16, ror 8 + ubfx x16, x9, #56, #8 + eor w13, w14, w17, ror 16 + ubfx x17, x8, #40, #8 + ldr w10, [x7, x10, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w17, [x7, x17, LSL 2] + eor w16, w16, w10, ror 24 + ldp x8, x9, [x20], #16 + eor w15, w15, w17, ror 8 + eor w15, w15, w16, ror 24 + bfi x13, x15, #32, #32 + # XOR in Key Schedule + eor x12, x12, x8 + eor x13, x13, x9 + ubfx x8, x13, #48, #8 + ubfx x11, x12, #24, #8 + ubfx x16, x13, #8, #8 + ubfx x17, x12, #32, #8 + ldr x14, [x7] + ldr x14, [x7, #64] + ldr x14, [x7, #128] + ldr x14, [x7, #192] + ldr x14, [x7, #256] + ldr x14, [x7, #320] + ldr x14, [x7, #384] + ldr x14, [x7, #448] + ldr x14, [x7, #512] + ldr x14, [x7, #576] + ldr x14, 
[x7, #640] + ldr x14, [x7, #704] + ldr x14, [x7, #768] + ldr x14, [x7, #832] + ldr x14, [x7, #896] + ldr x14, [x7, #960] + ldr w8, [x7, x8, LSL 2] + ldr w11, [x7, x11, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x9, x12, #16, #8 + eor w8, w8, w11, ror 24 + ubfx x11, x12, #56, #8 + eor w8, w8, w16, ror 8 + ubfx x16, x13, #40, #8 + eor w8, w8, w17, ror 16 + ubfx x17, x13, #0, #8 + ldr w9, [x7, x9, LSL 2] + ldr w11, [x7, x11, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x10, x12, #48, #8 + eor w9, w9, w11, ror 24 + ubfx x11, x13, #24, #8 + eor w9, w9, w16, ror 8 + ubfx x16, x12, #8, #8 + eor w9, w9, w17, ror 16 + ubfx x17, x13, #32, #8 + bfi x8, x9, #32, #32 + ldr w10, [x7, x10, LSL 2] + ldr w11, [x7, x11, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x14, x12, #0, #8 + eor w10, w10, w11, ror 24 + ubfx x11, x13, #16, #8 + eor w10, w10, w16, ror 8 + ubfx x16, x13, #56, #8 + eor w9, w10, w17, ror 16 + ubfx x17, x12, #40, #8 + ldr w14, [x7, x14, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w11, [x7, x11, LSL 2] + ldr w17, [x7, x17, LSL 2] + eor w16, w16, w14, ror 24 + ldp x12, x13, [x20], #16 + eor w11, w11, w17, ror 8 + eor w11, w11, w16, ror 24 + bfi x9, x11, #32, #32 + # XOR in Key Schedule + eor x8, x8, x12 + eor x9, x9, x13 + subs w19, w19, #2 + bne L_AES_CBC_decrypt_loop_nr_odd + ubfx x12, x9, #48, #8 + ubfx x15, x8, #24, #8 + ubfx x16, x9, #8, #8 + ubfx x17, x8, #32, #8 + ldr x10, [x7] + ldr x10, [x7, #64] + ldr x10, [x7, #128] + ldr x10, [x7, #192] + ldr x10, [x7, #256] + ldr x10, [x7, #320] + ldr x10, [x7, #384] + ldr x10, [x7, #448] + ldr x10, [x7, #512] + ldr x10, [x7, #576] + ldr x10, [x7, #640] + ldr x10, [x7, #704] + ldr x10, [x7, #768] + ldr x10, [x7, #832] + ldr x10, [x7, #896] + ldr x10, [x7, #960] + ldr w12, [x7, x12, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x13, x8, #16, #8 + eor w12, w12, w15, ror 24 + ubfx x15, x8, #56, #8 + 
eor w12, w12, w16, ror 8 + ubfx x16, x9, #40, #8 + eor w12, w12, w17, ror 16 + ubfx x17, x9, #0, #8 + ldr w13, [x7, x13, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x14, x8, #48, #8 + eor w13, w13, w15, ror 24 + ubfx x15, x9, #24, #8 + eor w13, w13, w16, ror 8 + ubfx x16, x8, #8, #8 + eor w13, w13, w17, ror 16 + ubfx x17, x9, #32, #8 + bfi x12, x13, #32, #32 + ldr w14, [x7, x14, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w17, [x7, x17, LSL 2] + ubfx x10, x8, #0, #8 + eor w14, w14, w15, ror 24 + ubfx x15, x9, #16, #8 + eor w14, w14, w16, ror 8 + ubfx x16, x9, #56, #8 + eor w13, w14, w17, ror 16 + ubfx x17, x8, #40, #8 + ldr w10, [x7, x10, LSL 2] + ldr w16, [x7, x16, LSL 2] + ldr w15, [x7, x15, LSL 2] + ldr w17, [x7, x17, LSL 2] + eor w16, w16, w10, ror 24 + ldp x8, x9, [x20], #16 + eor w15, w15, w17, ror 8 + eor w15, w15, w16, ror 24 + bfi x13, x15, #32, #32 + # XOR in Key Schedule + eor x12, x12, x8 + eor x13, x13, x9 + ubfx x8, x12, #32, #8 + ubfx x11, x13, #8, #8 + ubfx x16, x13, #48, #8 + ubfx x17, x12, #24, #8 + ldr x15, [x6] + ldr x15, [x6, #64] + ldr x15, [x6, #128] + ldr x15, [x6, #192] + ldr x15, [x6, #256] + ldr x15, [x6, #320] + ldr x15, [x6, #384] + ldr x15, [x6, #448] + ldr x15, [x6, #512] + ldr x15, [x6, #576] + ldr x15, [x6, #640] + ldr x15, [x6, #704] + ldr x15, [x6, #768] + ldr x15, [x6, #832] + ldr x15, [x6, #896] + ldr x15, [x6, #960] + ldrb w8, [x6, x8, LSL 0] + ldrb w11, [x6, x11, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ldrb w17, [x6, x17, LSL 0] + ubfx x9, x13, #0, #8 + eor w8, w8, w11, lsl 8 + ubfx x11, x13, #40, #8 + eor w8, w8, w16, lsl 16 + ubfx x16, x12, #16, #8 + eor w8, w8, w17, lsl 24 + ubfx x17, x12, #56, #8 + ldrb w11, [x6, x11, LSL 0] + ldrb w17, [x6, x17, LSL 0] + ldrb w9, [x6, x9, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ubfx x10, x13, #32, #8 + eor w9, w9, w11, lsl 8 + ubfx x11, x12, #8, #8 + eor w9, w9, w16, lsl 16 + ubfx x16, x12, #48, #8 + eor w9, w9, 
w17, lsl 24 + ubfx x17, x13, #24, #8 + bfi x8, x9, #32, #32 + ldrb w11, [x6, x11, LSL 0] + ldrb w17, [x6, x17, LSL 0] + ldrb w10, [x6, x10, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ubfx x15, x13, #56, #8 + eor w10, w10, w11, lsl 8 + ubfx x11, x12, #0, #8 + eor w10, w10, w16, lsl 16 + ubfx x16, x12, #40, #8 + eor w9, w10, w17, lsl 24 + ubfx x17, x13, #16, #8 + ldrb w15, [x6, x15, LSL 0] + ldrb w16, [x6, x16, LSL 0] + ldrb w11, [x6, x11, LSL 0] + ldrb w17, [x6, x17, LSL 0] + eor w16, w16, w15, lsl 16 + ldp x12, x13, [x20] + eor w11, w11, w16, lsl 8 + eor w11, w11, w17, lsl 16 + bfi x9, x11, #32, #32 + # XOR in Key Schedule + eor x8, x8, x12 + eor x9, x9, x13 + rev32 x8, x8 + rev32 x9, x9 + ldnp x12, x13, [x5, #16] /* ciphertext parked at [x5,#16] by the preceding (even) block */ + eor x8, x8, x12 + eor x9, x9, x13 + str x8, [x1] + str x9, [x1, #8] + subs x2, x2, #16 + add x0, x0, #16 + add x1, x1, #16 + bne L_AES_CBC_decrypt_loop_block + b L_AES_CBC_decrypt_end_dec +L_AES_CBC_decrypt_end_dec_odd: + ldnp x12, x13, [x5, #16] + stp x12, x13, [x5] /* input ended after an even block: move its parked ciphertext to [x5] */ +L_AES_CBC_decrypt_end_dec: + ldp x17, x19, [x29, #24] + ldr x20, [x29, #40] + ldp x29, x30, [sp], #48 + ret +#ifndef __APPLE__ + .size AES_CBC_decrypt,.-AES_CBC_decrypt +#endif /* __APPLE__ */ +#endif /* HAVE_AES_CBC */ +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || HAVE_AES_CBC + * HAVE_AES_ECB */ +#endif /* HAVE_AES_DECRYPT */ +#ifdef HAVE_AESGCM +#ifndef __APPLE__ + .text + .type L_GCM_gmult_len_r, %object + .section .rodata + .size L_GCM_gmult_len_r, 128 +#else + .section __DATA,__data +#endif /* __APPLE__ */ +#ifndef __APPLE__ + .align 3 +#else + .p2align 3 +#endif /* __APPLE__ */ +L_GCM_gmult_len_r: /* 32 words (128 bytes) read by GCM_gmult_len through x10. Entries 0-15 are selected by one 4-bit index, entries 16-31 by another index plus 16; every entry in the second half is the matching first-half entry shifted right by 4 bits. Each value read is XORed into the accumulator shifted left by 32. Presumably the GHASH reduction constants for the bits shifted out - confirm against the C implementation. */ + .word 0x00000000 + .word 0x1c200000 + .word 0x38400000 + .word 0x24600000 + .word 0x70800000 + .word 0x6ca00000 + .word 0x48c00000 + .word 0x54e00000 + .word 0xe1000000 + .word 0xfd200000 + .word 0xd9400000 + .word 0xc5600000 + .word 0x91800000 + .word 0x8da00000 + .word 0xa9c00000 + .word 0xb5e00000 + .word 0x00000000 + .word 0x01c20000 + .word 0x03840000 + .word 0x02460000 + .word
0x07080000 + .word 0x06ca0000 + .word 0x048c0000 + .word 0x054e0000 + .word 0x0e100000 + .word 0x0fd20000 + .word 0x0d940000 + .word 0x0c560000 + .word 0x09180000 + .word 0x08da0000 + .word 0x0a9c0000 + .word 0x0b5e0000 +#ifndef __APPLE__ +.text +.globl GCM_gmult_len +.type GCM_gmult_len,@function +.align 2 +GCM_gmult_len: +#else +.section __TEXT,__text +.globl _GCM_gmult_len +.p2align 2 +_GCM_gmult_len: +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x10, L_GCM_gmult_len_r /* GCM_gmult_len, as used by the code below: x0 = 16-byte state (read, updated, written back each pass), x1 = table of 16-byte entries (addressed at nibble*16 and at (nibble+16)*16), x2 = input data (16 bytes per pass), x3 = byte count (reduced by 16 per pass). Leaf routine: only x4-x12 are used as scratch and the stack is not touched. NOTE(review): the loop is bottom-tested, so x3 is presumably a non-zero multiple of 16 - confirm with callers. */ + add x10, x10, :lo12:L_GCM_gmult_len_r +#else + adrp x10, L_GCM_gmult_len_r@PAGE + add x10, x10, :lo12:L_GCM_gmult_len_r@PAGEOFF +#endif /* __APPLE__ */ +L_GCM_gmult_len_start_block: + ldp x4, x5, [x0] + ldp x6, x7, [x2] + eor x4, x4, x6 + eor x5, x5, x7 /* x4:x5 = state XOR input block */ + ubfx x12, x5, #56, #4 /* one byte is consumed per step, from bits 56-63 of x5 down to bits 0-7 of x4; this is the low nibble of the first byte */ + add x12, x1, x12, lsl 4 + ldp x8, x9, [x12] /* x8:x9 = running accumulator, seeded from the first table entry */ + ubfx x12, x5, #60, #4 /* high nibble of the same byte */ + mov x11, x9 /* copy taken before the shift; its low bits select the reduction words below */ + add x12, x12, #16 /* +16 entries: the high nibble reads the second half of the table (x1 + 256) */ + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 /* x8:x9 shifted right by 8 bits as one 128-bit value */ + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 /* step back 256 bytes to the first-half entry for the same nibble */ + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x5, #48, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x5, #52, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x5, #40, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x5, #44, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9,
x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x5, #32, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x5, #36, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x5, #24, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x5, #28, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x5, #16, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x5, #20, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x5, #8, #4 + add x12, x1, x12, lsl 4 + ldp 
x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x5, #12, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x5, #0, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x5, #4, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x4, #56, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x4, #60, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x4, #48, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x4, #52, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + 
ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x4, #40, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x4, #44, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x4, #32, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x4, #36, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x4, #24, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x4, #28, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x4, #16, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x4, #20, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, 
x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfx x12, x4, #8, #4 + add x12, x1, x12, lsl 4 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x12, x4, #12, #4 + mov x11, x9 + add x12, x12, #16 + lsr x9, x9, #8 + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 56 + ldp x6, x7, [x12] + lsr x8, x8, #8 + eor x8, x8, x6 + sub x12, x12, #0x100 + eor x9, x9, x7 + ldr x7, [x12, #8] + ubfx w6, w11, #0, #4 + eor x11, x11, x7, lsl 4 + add w6, w6, #16 + ubfx w11, w11, #4, #4 + ldr w6, [x10, x6, LSL 2] + ldr w7, [x10, x11, LSL 2] + eor x8, x8, x6, lsl 32 + eor x8, x8, x7, lsl 32 + ubfiz x12, x4, #4, #4 /* last byte (bits 0-7 of x4): low nibble already scaled by 16 as a table offset */ + add x12, x12, x1 + ldp x6, x7, [x12] + eor x8, x8, x6 + eor x9, x9, x7 + ubfx x11, x9, #0, #4 + ubfx x12, x4, #4, #4 + lsr x9, x9, #4 /* the final step shifts the accumulator by 4 bits only, reads the first half of the x1 table and needs a single word from L_GCM_gmult_len_r */ + add x12, x1, x12, lsl 4 + orr x9, x9, x8, lsl 60 + ldp x6, x7, [x12] + lsr x8, x8, #4 + eor x8, x8, x6 + ldr w6, [x10, x11, LSL 2] + eor x9, x9, x7 + eor x8, x8, x6, lsl 32 + rev x8, x8 + rev x9, x9 + stp x8, x9, [x0] /* byte-reversed accumulator becomes the new state at [x0] */ + subs x3, x3, #16 + add x2, x2, #16 + bne L_GCM_gmult_len_start_block /* next 16 input bytes until the count reaches zero */ + ret +#ifndef __APPLE__ + .size GCM_gmult_len,.-GCM_gmult_len +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_GCM_encrypt +.type AES_GCM_encrypt,@function +.align 2 +AES_GCM_encrypt: +#else +.section __TEXT,__text +.globl _AES_GCM_encrypt +.p2align 2 +_AES_GCM_encrypt: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-48]! /* AES_GCM_encrypt, as used by the code below: x0 = input, x1 = output, x2 = byte count (reduced by 16 per block), x3 = key schedule (16 bytes read per round), w4 = round count, x5 = 16-byte counter block (read on entry, written back before ret). Per block: the counter is incremented, run through the table-based AES rounds, XORed with 16 input bytes and stored to the output. NOTE(review): the block loop is bottom-tested, so x2 is presumably a non-zero multiple of 16 - confirm with callers. */
+ add x29, sp, #0 + stp x17, x19, [x29, #16] + stp x20, x21, [x29, #32] /* x17 and x19-x21 are reloaded from these slots before ret */ +#ifndef __APPLE__ + adrp x19, L_AES_ARM64_te + add x19, x19, :lo12:L_AES_ARM64_te /* x19 = table read as 32-bit words in the main rounds; the last round reads single bytes from it at index*4 */ +#else + adrp x19, L_AES_ARM64_te@PAGE + add x19, x19, :lo12:L_AES_ARM64_te@PAGEOFF +#endif /* __APPLE__ */ + ldp x16, x17, [x5] + rev32 x16, x16 + rev32 x17, x17 /* counter is kept in x16:x17 with each 32-bit word byte-swapped */ +L_AES_GCM_encrypt_loop_block: + mov x21, x3 + lsr x9, x17, #32 + ldp x10, x11, [x21], #16 + add w9, w9, #1 + bfi x17, x9, #32, #32 /* upper word of x17 (bytes 12-15 of the counter block) incremented modulo 2^32 before the block is encrypted */ + # Round: 0 - XOR in key schedule + eor x6, x16, x10 + eor x7, x17, x11 + sub w20, w4, #2 /* loop counter: the loop below performs two rounds per pass, two more rounds follow it */ +L_AES_GCM_encrypt_loop_nr: + ubfx x10, x6, #48, #8 + ubfx x13, x6, #24, #8 + ubfx x14, x7, #8, #8 + ubfx x15, x7, #32, #8 + ldr x8, [x19] /* this and the next 15 loads never use the value read (x8 is overwritten before any use); they touch the table every 64 bytes across 1 KiB - presumably to pull it into cache ahead of the data-dependent lookups, TODO confirm */ + ldr x8, [x19, #64] + ldr x8, [x19, #128] + ldr x8, [x19, #192] + ldr x8, [x19, #256] + ldr x8, [x19, #320] + ldr x8, [x19, #384] + ldr x8, [x19, #448] + ldr x8, [x19, #512] + ldr x8, [x19, #576] + ldr x8, [x19, #640] + ldr x8, [x19, #704] + ldr x8, [x19, #768] + ldr x8, [x19, #832] + ldr x8, [x19, #896] + ldr x8, [x19, #960] + ldr w10, [x19, x10, LSL 2] + ldr w13, [x19, x13, LSL 2] + ldr w14, [x19, x14, LSL 2] + ldr w15, [x19, x15, LSL 2] + ubfx x11, x7, #16, #8 + eor w10, w10, w13, ror 24 + ubfx x13, x6, #56, #8 + eor w10, w10, w14, ror 8 + ubfx x14, x7, #40, #8 + eor w10, w10, w15, ror 16 + ubfx x15, x6, #0, #8 + ldr w11, [x19, x11, LSL 2] + ldr w13, [x19, x13, LSL 2] + ldr w14, [x19, x14, LSL 2] + ldr w15, [x19, x15, LSL 2] + ubfx x12, x7, #48, #8 + eor w11, w11, w13, ror 24 + ubfx x13, x7, #24, #8 + eor w11, w11, w14, ror 8 + ubfx x14, x6, #8, #8 + eor w11, w11, w15, ror 16 + ubfx x15, x6, #32, #8 + bfi x10, x11, #32, #32 + ldr w12, [x19, x12, LSL 2] + ldr w13, [x19, x13, LSL 2] + ldr w14, [x19, x14, LSL 2]
+ ldr w15, [x19, x15, LSL 2] + eor w14, w14, w8, ror 24 + ldp x6, x7, [x21], #16 + eor w13, w13, w14, ror 24 + eor w13, w13, w15, ror 8 + bfi x11, x13, #32, #32 + # XOR in Key Schedule + eor x10, x10, x6 + eor x11, x11, x7 + ubfx x6, x10, #48, #8 + ubfx x9, x10, #24, #8 + ubfx x14, x11, #8, #8 + ubfx x15, x11, #32, #8 + ldr x12, [x19] + ldr x12, [x19, #64] + ldr x12, [x19, #128] + ldr x12, [x19, #192] + ldr x12, [x19, #256] + ldr x12, [x19, #320] + ldr x12, [x19, #384] + ldr x12, [x19, #448] + ldr x12, [x19, #512] + ldr x12, [x19, #576] + ldr x12, [x19, #640] + ldr x12, [x19, #704] + ldr x12, [x19, #768] + ldr x12, [x19, #832] + ldr x12, [x19, #896] + ldr x12, [x19, #960] + ldr w6, [x19, x6, LSL 2] + ldr w9, [x19, x9, LSL 2] + ldr w14, [x19, x14, LSL 2] + ldr w15, [x19, x15, LSL 2] + ubfx x7, x11, #16, #8 + eor w6, w6, w9, ror 24 + ubfx x9, x10, #56, #8 + eor w6, w6, w14, ror 8 + ubfx x14, x11, #40, #8 + eor w6, w6, w15, ror 16 + ubfx x15, x10, #0, #8 + ldr w7, [x19, x7, LSL 2] + ldr w9, [x19, x9, LSL 2] + ldr w14, [x19, x14, LSL 2] + ldr w15, [x19, x15, LSL 2] + ubfx x8, x11, #48, #8 + eor w7, w7, w9, ror 24 + ubfx x9, x11, #24, #8 + eor w7, w7, w14, ror 8 + ubfx x14, x10, #8, #8 + eor w7, w7, w15, ror 16 + ubfx x15, x10, #32, #8 + bfi x6, x7, #32, #32 + ldr w8, [x19, x8, LSL 2] + ldr w9, [x19, x9, LSL 2] + ldr w14, [x19, x14, LSL 2] + ldr w15, [x19, x15, LSL 2] + ubfx x12, x11, #0, #8 + eor w8, w8, w9, ror 24 + ubfx x9, x10, #16, #8 + eor w8, w8, w14, ror 8 + ubfx x14, x11, #56, #8 + eor w7, w8, w15, ror 16 + ubfx x15, x10, #40, #8 + ldr w12, [x19, x12, LSL 2] + ldr w14, [x19, x14, LSL 2] + ldr w9, [x19, x9, LSL 2] + ldr w15, [x19, x15, LSL 2] + eor w14, w14, w12, ror 24 + ldp x10, x11, [x21], #16 + eor w9, w9, w14, ror 24 + eor w9, w9, w15, ror 8 + bfi x7, x9, #32, #32 + # XOR in Key Schedule + eor x6, x6, x10 + eor x7, x7, x11 + subs w20, w20, #2 + bne L_AES_GCM_encrypt_loop_nr + ubfx x10, x6, #48, #8 + ubfx x13, x6, #24, #8 + ubfx x14, x7, #8, #8 + ubfx x15, 
x7, #32, #8 + ldr x8, [x19] + ldr x8, [x19, #64] + ldr x8, [x19, #128] + ldr x8, [x19, #192] + ldr x8, [x19, #256] + ldr x8, [x19, #320] + ldr x8, [x19, #384] + ldr x8, [x19, #448] + ldr x8, [x19, #512] + ldr x8, [x19, #576] + ldr x8, [x19, #640] + ldr x8, [x19, #704] + ldr x8, [x19, #768] + ldr x8, [x19, #832] + ldr x8, [x19, #896] + ldr x8, [x19, #960] + ldr w10, [x19, x10, LSL 2] + ldr w13, [x19, x13, LSL 2] + ldr w14, [x19, x14, LSL 2] + ldr w15, [x19, x15, LSL 2] + ubfx x11, x7, #16, #8 + eor w10, w10, w13, ror 24 + ubfx x13, x6, #56, #8 + eor w10, w10, w14, ror 8 + ubfx x14, x7, #40, #8 + eor w10, w10, w15, ror 16 + ubfx x15, x6, #0, #8 + ldr w11, [x19, x11, LSL 2] + ldr w13, [x19, x13, LSL 2] + ldr w14, [x19, x14, LSL 2] + ldr w15, [x19, x15, LSL 2] + ubfx x12, x7, #48, #8 + eor w11, w11, w13, ror 24 + ubfx x13, x7, #24, #8 + eor w11, w11, w14, ror 8 + ubfx x14, x6, #8, #8 + eor w11, w11, w15, ror 16 + ubfx x15, x6, #32, #8 + bfi x10, x11, #32, #32 + ldr w12, [x19, x12, LSL 2] + ldr w13, [x19, x13, LSL 2] + ldr w14, [x19, x14, LSL 2] + ldr w15, [x19, x15, LSL 2] + ubfx x8, x7, #0, #8 + eor w12, w12, w13, ror 24 + ubfx x13, x6, #16, #8 + eor w12, w12, w14, ror 8 + ubfx x14, x7, #56, #8 + eor w11, w12, w15, ror 16 + ubfx x15, x6, #40, #8 + ldr w8, [x19, x8, LSL 2] + ldr w14, [x19, x14, LSL 2] + ldr w13, [x19, x13, LSL 2] + ldr w15, [x19, x15, LSL 2] + eor w14, w14, w8, ror 24 + ldp x6, x7, [x21], #16 + eor w13, w13, w14, ror 24 + eor w13, w13, w15, ror 8 + bfi x11, x13, #32, #32 + # XOR in Key Schedule + eor x10, x10, x6 + eor x11, x11, x7 + ubfx x6, x11, #32, #8 + ubfx x9, x11, #8, #8 + ubfx x14, x10, #48, #8 + ubfx x15, x10, #24, #8 + lsl w6, w6, #2 + lsl w9, w9, #2 + lsl w14, w14, #2 + lsl w15, w15, #2 + ldr x13, [x19] + ldr x13, [x19, #64] + ldr x13, [x19, #128] + ldr x13, [x19, #192] + ldr x13, [x19, #256] + ldr x13, [x19, #320] + ldr x13, [x19, #384] + ldr x13, [x19, #448] + ldr x13, [x19, #512] + ldr x13, [x19, #576] + ldr x13, [x19, #640] + ldr x13, 
[x19, #704] + ldr x13, [x19, #768] + ldr x13, [x19, #832] + ldr x13, [x19, #896] + ldr x13, [x19, #960] + ldrb w6, [x19, x6, LSL 0] + ldrb w9, [x19, x9, LSL 0] + ldrb w14, [x19, x14, LSL 0] + ldrb w15, [x19, x15, LSL 0] + ubfx x7, x10, #0, #8 + eor w6, w6, w9, lsl 8 + ubfx x9, x11, #40, #8 + eor w6, w6, w14, lsl 16 + ubfx x14, x11, #16, #8 + eor w6, w6, w15, lsl 24 + ubfx x15, x10, #56, #8 + lsl w7, w7, #2 + lsl w9, w9, #2 + lsl w14, w14, #2 + lsl w15, w15, #2 + ldrb w7, [x19, x7, LSL 0] + ldrb w9, [x19, x9, LSL 0] + ldrb w14, [x19, x14, LSL 0] + ldrb w15, [x19, x15, LSL 0] + ubfx x8, x10, #32, #8 + eor w7, w7, w9, lsl 8 + ubfx x9, x10, #8, #8 + eor w7, w7, w14, lsl 16 + ubfx x14, x11, #48, #8 + eor w7, w7, w15, lsl 24 + ubfx x15, x11, #24, #8 + bfi x6, x7, #32, #32 + lsl w8, w8, #2 + lsl w9, w9, #2 + lsl w14, w14, #2 + lsl w15, w15, #2 + ldrb w8, [x19, x8, LSL 0] + ldrb w9, [x19, x9, LSL 0] + ldrb w14, [x19, x14, LSL 0] + ldrb w15, [x19, x15, LSL 0] + ubfx x13, x11, #56, #8 + eor w8, w8, w9, lsl 8 + ubfx x9, x11, #0, #8 + eor w8, w8, w14, lsl 16 + ubfx x14, x10, #40, #8 + eor w7, w8, w15, lsl 24 + ubfx x15, x10, #16, #8 + lsl w13, w13, #2 + lsl w9, w9, #2 + lsl w14, w14, #2 + lsl w15, w15, #2 + ldrb w13, [x19, x13, LSL 0] + ldrb w9, [x19, x9, LSL 0] + ldrb w14, [x19, x14, LSL 0] + ldrb w15, [x19, x15, LSL 0] + eor w14, w14, w13, lsl 16 + ldp x10, x11, [x21] + eor w9, w9, w14, lsl 8 + eor w9, w9, w15, lsl 16 + bfi x7, x9, #32, #32 + # XOR in Key Schedule + eor x6, x6, x10 + eor x7, x7, x11 + rev32 x6, x6 + rev32 x7, x7 + ldr x10, [x0] + ldr x11, [x0, #8] + eor x6, x6, x10 + eor x7, x7, x11 + str x6, [x1] + str x7, [x1, #8] + subs x2, x2, #16 + add x0, x0, #16 + add x1, x1, #16 + bne L_AES_GCM_encrypt_loop_block + rev32 x16, x16 + rev32 x17, x17 + stp x16, x17, [x5] + ldp x17, x19, [x29, #16] + ldp x20, x21, [x29, #32] + ldp x29, x30, [sp], #48 + ret +#ifndef __APPLE__ + .size AES_GCM_encrypt,.-AES_GCM_encrypt +#endif /* __APPLE__ */ +#endif /* HAVE_AESGCM */ 
+#ifdef WOLFSSL_AES_XTS /* AES_XTS_encrypt: AES-XTS encryption written with general-purpose register instructions and lookups in the L_AES_ARM64_te table. Registers as used below: x0 = input pointer, x1 = output pointer, w2 = length in bytes, x3 = pointer to the 16-byte initial tweak, x4 = round keys applied to the data blocks, x5 = round keys applied to the tweak, x6 = 16-byte scratch buffer used for a trailing partial block, w7 = round count. The round loops count w7 minus 2 down in steps of 2 and exit only on exactly zero, so w7 must be even and at least 4. The first block is processed before any length check, so the caller presumably guarantees w2 is at least 16 (TODO confirm against the C caller). */ +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt +.type AES_XTS_encrypt,@function +.align 2 +AES_XTS_encrypt: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt +.p2align 2 +_AES_XTS_encrypt: +#endif /* __APPLE__ */ + /* Prologue: reserve 96 bytes of stack and save x29, x30, x17 and x19-x26. */ stp x29, x30, [sp, #-96]! + add x29, sp, #0 + stp x17, x19, [x29, #24] + stp x20, x21, [x29, #40] + stp x22, x23, [x29, #56] + stp x24, x25, [x29, #72] + str x26, [x29, #88] +#ifndef __APPLE__ + adrp x8, L_AES_ARM64_te + add x8, x8, :lo12:L_AES_ARM64_te +#else + adrp x8, L_AES_ARM64_te@PAGE + add x8, x8, :lo12:L_AES_ARM64_te@PAGEOFF +#endif /* __APPLE__ */ + /* x8 now holds the address of L_AES_ARM64_te. x9 = 0x87, the constant XORed into the tweak when it is doubled. */ mov x9, #0x87 + /* Encrypt the initial tweak: load it from x3 into x21:x22 and run it through the rounds with the key schedule at x5. */ mov x26, x5 + ldp x21, x22, [x3] + ldp x14, x15, [x26], #16 + /* Byte-swap each 32-bit word of the state before the rounds. */ rev32 x21, x21 + rev32 x22, x22 + # Round: 0 - XOR in key schedule + eor x21, x21, x14 + eor x22, x22, x15 + /* w25 = rounds to run inside the loop; each iteration performs two rounds. */ sub w25, w7, #2 +L_AES_XTS_encrypt_loop_nr_tweak: + ubfx x14, x21, #48, #8 + ubfx x17, x21, #24, #8 + ubfx x19, x22, #8, #8 + ubfx x20, x22, #32, #8 + /* Read one word from every 64-byte chunk of the first 1KB of the table; the loaded value is overwritten before use. Presumably this pulls the table into cache ahead of the data-dependent lookups to limit cache-timing leakage. */ ldr x23, [x8] + ldr x23, [x8, #64] + ldr x23, [x8, #128] + ldr x23, [x8, #192] + ldr x23, [x8, #256] + ldr x23, [x8, #320] + ldr x23, [x8, #384] + ldr x23, [x8, #448] + ldr x23, [x8, #512] + ldr x23, [x8, #576] + ldr x23, [x8, #640] + ldr x23, [x8, #704] + ldr x23, [x8, #768] + ldr x23, [x8, #832] + ldr x23, [x8, #896] + ldr x23, [x8, #960] + /* Each output word of a round is four 32-bit table entries, selected by state bytes, rotated and XORed together. */ ldr w14, [x8, x14, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x15, x22, #16, #8 + eor w14, w14, w17, ror 24 + ubfx x17, x21, #56, #8 + eor w14, w14, w19, ror 8 + ubfx x19, x22, #40, #8 + eor w14, w14, w20, ror 16 + ubfx x20, x21, #0, #8 + ldr w15, [x8, x15, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x16, x22, #48, #8 + eor w15, w15, w17, ror 24 + ubfx x17, x22, #24, #8 + eor w15, w15, w19, ror 8 + ubfx x19, x21, #8, #8 + eor w15, w15, w20, ror 16 + ubfx x20, x21, #32, #8 + bfi x14, x15, #32, #32 + ldr w16, [x8, x16, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20,
[x8, x20, LSL 2] + ubfx x23, x22, #0, #8 + eor w16, w16, w17, ror 24 + ubfx x17, x21, #16, #8 + eor w16, w16, w19, ror 8 + ubfx x19, x22, #56, #8 + eor w15, w16, w20, ror 16 + ubfx x20, x21, #40, #8 + ldr w23, [x8, x23, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + eor w19, w19, w23, ror 24 + ldp x21, x22, [x26], #16 + eor w17, w17, w19, ror 24 + eor w17, w17, w20, ror 8 + bfi x15, x17, #32, #32 + # XOR in Key Schedule + eor x14, x14, x21 + eor x15, x15, x22 + /* Second round of this loop iteration: reads the state in x14:x15 and writes it back to x21:x22. */ ubfx x21, x14, #48, #8 + ubfx x24, x14, #24, #8 + ubfx x19, x15, #8, #8 + ubfx x20, x15, #32, #8 + ldr x16, [x8] + ldr x16, [x8, #64] + ldr x16, [x8, #128] + ldr x16, [x8, #192] + ldr x16, [x8, #256] + ldr x16, [x8, #320] + ldr x16, [x8, #384] + ldr x16, [x8, #448] + ldr x16, [x8, #512] + ldr x16, [x8, #576] + ldr x16, [x8, #640] + ldr x16, [x8, #704] + ldr x16, [x8, #768] + ldr x16, [x8, #832] + ldr x16, [x8, #896] + ldr x16, [x8, #960] + ldr w21, [x8, x21, LSL 2] + ldr w24, [x8, x24, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x22, x15, #16, #8 + eor w21, w21, w24, ror 24 + ubfx x24, x14, #56, #8 + eor w21, w21, w19, ror 8 + ubfx x19, x15, #40, #8 + eor w21, w21, w20, ror 16 + ubfx x20, x14, #0, #8 + ldr w22, [x8, x22, LSL 2] + ldr w24, [x8, x24, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x23, x15, #48, #8 + eor w22, w22, w24, ror 24 + ubfx x24, x15, #24, #8 + eor w22, w22, w19, ror 8 + ubfx x19, x14, #8, #8 + eor w22, w22, w20, ror 16 + ubfx x20, x14, #32, #8 + bfi x21, x22, #32, #32 + ldr w23, [x8, x23, LSL 2] + ldr w24, [x8, x24, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x16, x15, #0, #8 + eor w23, w23, w24, ror 24 + ubfx x24, x14, #16, #8 + eor w23, w23, w19, ror 8 + ubfx x19, x15, #56, #8 + eor w22, w23, w20, ror 16 + ubfx x20, x14, #40, #8 + ldr w16, [x8, x16, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w24, [x8, x24, LSL 2] + ldr w20, [x8, x20, LSL 2] + eor w19, w19, w16, ror
24 + ldp x14, x15, [x26], #16 + eor w24, w24, w19, ror 24 + eor w24, w24, w20, ror 8 + bfi x22, x24, #32, #32 + # XOR in Key Schedule + eor x21, x21, x14 + eor x22, x22, x15 + subs w25, w25, #2 + bne L_AES_XTS_encrypt_loop_nr_tweak + /* One more table round after the loop. */ ubfx x14, x21, #48, #8 + ubfx x17, x21, #24, #8 + ubfx x19, x22, #8, #8 + ubfx x20, x22, #32, #8 + ldr x23, [x8] + ldr x23, [x8, #64] + ldr x23, [x8, #128] + ldr x23, [x8, #192] + ldr x23, [x8, #256] + ldr x23, [x8, #320] + ldr x23, [x8, #384] + ldr x23, [x8, #448] + ldr x23, [x8, #512] + ldr x23, [x8, #576] + ldr x23, [x8, #640] + ldr x23, [x8, #704] + ldr x23, [x8, #768] + ldr x23, [x8, #832] + ldr x23, [x8, #896] + ldr x23, [x8, #960] + ldr w14, [x8, x14, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x15, x22, #16, #8 + eor w14, w14, w17, ror 24 + ubfx x17, x21, #56, #8 + eor w14, w14, w19, ror 8 + ubfx x19, x22, #40, #8 + eor w14, w14, w20, ror 16 + ubfx x20, x21, #0, #8 + ldr w15, [x8, x15, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x16, x22, #48, #8 + eor w15, w15, w17, ror 24 + ubfx x17, x22, #24, #8 + eor w15, w15, w19, ror 8 + ubfx x19, x21, #8, #8 + eor w15, w15, w20, ror 16 + ubfx x20, x21, #32, #8 + bfi x14, x15, #32, #32 + ldr w16, [x8, x16, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x23, x22, #0, #8 + eor w16, w16, w17, ror 24 + ubfx x17, x21, #16, #8 + eor w16, w16, w19, ror 8 + ubfx x19, x22, #56, #8 + eor w15, w16, w20, ror 16 + ubfx x20, x21, #40, #8 + ldr w23, [x8, x23, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + eor w19, w19, w23, ror 24 + ldp x21, x22, [x26], #16 + eor w17, w17, w19, ror 24 + eor w17, w17, w20, ror 8 + bfi x15, x17, #32, #32 + # XOR in Key Schedule + eor x14, x14, x21 + eor x15, x15, x22 + /* Last round: single-byte loads at byte offset 4 times the index, assembled with left shifts instead of rotates, then XORed with the last round key. */ ubfx x21, x15, #32, #8 + ubfx x24, x15, #8, #8 + ubfx x19, x14, #48, #8 + ubfx x20, x14, #24, #8 + lsl
w21, w21, #2 + lsl w24, w24, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldr x17, [x8] + ldr x17, [x8, #64] + ldr x17, [x8, #128] + ldr x17, [x8, #192] + ldr x17, [x8, #256] + ldr x17, [x8, #320] + ldr x17, [x8, #384] + ldr x17, [x8, #448] + ldr x17, [x8, #512] + ldr x17, [x8, #576] + ldr x17, [x8, #640] + ldr x17, [x8, #704] + ldr x17, [x8, #768] + ldr x17, [x8, #832] + ldr x17, [x8, #896] + ldr x17, [x8, #960] + ldrb w21, [x8, x21, LSL 0] + ldrb w24, [x8, x24, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + ubfx x22, x14, #0, #8 + eor w21, w21, w24, lsl 8 + ubfx x24, x15, #40, #8 + eor w21, w21, w19, lsl 16 + ubfx x19, x15, #16, #8 + eor w21, w21, w20, lsl 24 + ubfx x20, x14, #56, #8 + lsl w22, w22, #2 + lsl w24, w24, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w22, [x8, x22, LSL 0] + ldrb w24, [x8, x24, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + ubfx x23, x14, #32, #8 + eor w22, w22, w24, lsl 8 + ubfx x24, x14, #8, #8 + eor w22, w22, w19, lsl 16 + ubfx x19, x15, #48, #8 + eor w22, w22, w20, lsl 24 + ubfx x20, x15, #24, #8 + bfi x21, x22, #32, #32 + lsl w23, w23, #2 + lsl w24, w24, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w23, [x8, x23, LSL 0] + ldrb w24, [x8, x24, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + ubfx x17, x15, #56, #8 + eor w23, w23, w24, lsl 8 + ubfx x24, x15, #0, #8 + eor w23, w23, w19, lsl 16 + ubfx x19, x14, #40, #8 + eor w22, w23, w20, lsl 24 + ubfx x20, x14, #16, #8 + lsl w17, w17, #2 + lsl w24, w24, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w17, [x8, x17, LSL 0] + ldrb w24, [x8, x24, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + eor w19, w19, w17, lsl 16 + ldp x14, x15, [x26] + eor w24, w24, w19, lsl 8 + eor w24, w24, w20, lsl 16 + bfi x22, x24, #32, #32 + # XOR in Key Schedule + eor x21, x21, x14 + eor x22, x22, x15 + /* Swap the bytes back; x21:x22 now hold the encrypted tweak. */ rev32 x21, x21 + rev32 x22, x22 +L_AES_XTS_encrypt_loop_block: + /* Per-block loop: XOR the 16 input bytes at x0 with the tweak, run the rounds with the key schedule at x4, XOR with the tweak again and store to x1. */ mov x26, x4 + ldp x10, x11, [x0] + ldp x14, x15, [x26], #16
+ eor x10, x10, x21 + eor x11, x11, x22 + rev32 x10, x10 + rev32 x11, x11 + # Round: 0 - XOR in key schedule + eor x10, x10, x14 + eor x11, x11, x15 + sub w25, w7, #2 +L_AES_XTS_encrypt_loop_nr: + ubfx x14, x10, #48, #8 + ubfx x17, x10, #24, #8 + ubfx x19, x11, #8, #8 + ubfx x20, x11, #32, #8 + ldr x12, [x8] + ldr x12, [x8, #64] + ldr x12, [x8, #128] + ldr x12, [x8, #192] + ldr x12, [x8, #256] + ldr x12, [x8, #320] + ldr x12, [x8, #384] + ldr x12, [x8, #448] + ldr x12, [x8, #512] + ldr x12, [x8, #576] + ldr x12, [x8, #640] + ldr x12, [x8, #704] + ldr x12, [x8, #768] + ldr x12, [x8, #832] + ldr x12, [x8, #896] + ldr x12, [x8, #960] + ldr w14, [x8, x14, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x15, x11, #16, #8 + eor w14, w14, w17, ror 24 + ubfx x17, x10, #56, #8 + eor w14, w14, w19, ror 8 + ubfx x19, x11, #40, #8 + eor w14, w14, w20, ror 16 + ubfx x20, x10, #0, #8 + ldr w15, [x8, x15, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x16, x11, #48, #8 + eor w15, w15, w17, ror 24 + ubfx x17, x11, #24, #8 + eor w15, w15, w19, ror 8 + ubfx x19, x10, #8, #8 + eor w15, w15, w20, ror 16 + ubfx x20, x10, #32, #8 + bfi x14, x15, #32, #32 + ldr w16, [x8, x16, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x12, x11, #0, #8 + eor w16, w16, w17, ror 24 + ubfx x17, x10, #16, #8 + eor w16, w16, w19, ror 8 + ubfx x19, x11, #56, #8 + eor w15, w16, w20, ror 16 + ubfx x20, x10, #40, #8 + ldr w12, [x8, x12, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + eor w19, w19, w12, ror 24 + ldp x10, x11, [x26], #16 + eor w17, w17, w19, ror 24 + eor w17, w17, w20, ror 8 + bfi x15, x17, #32, #32 + # XOR in Key Schedule + eor x14, x14, x10 + eor x15, x15, x11 + ubfx x10, x14, #48, #8 + ubfx x13, x14, #24, #8 + ubfx x19, x15, #8, #8 + ubfx x20, x15, #32, #8 + ldr x16, [x8] + ldr x16, [x8, #64] + ldr
x16, [x8, #128] + ldr x16, [x8, #192] + ldr x16, [x8, #256] + ldr x16, [x8, #320] + ldr x16, [x8, #384] + ldr x16, [x8, #448] + ldr x16, [x8, #512] + ldr x16, [x8, #576] + ldr x16, [x8, #640] + ldr x16, [x8, #704] + ldr x16, [x8, #768] + ldr x16, [x8, #832] + ldr x16, [x8, #896] + ldr x16, [x8, #960] + ldr w10, [x8, x10, LSL 2] + ldr w13, [x8, x13, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x11, x15, #16, #8 + eor w10, w10, w13, ror 24 + ubfx x13, x14, #56, #8 + eor w10, w10, w19, ror 8 + ubfx x19, x15, #40, #8 + eor w10, w10, w20, ror 16 + ubfx x20, x14, #0, #8 + ldr w11, [x8, x11, LSL 2] + ldr w13, [x8, x13, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x12, x15, #48, #8 + eor w11, w11, w13, ror 24 + ubfx x13, x15, #24, #8 + eor w11, w11, w19, ror 8 + ubfx x19, x14, #8, #8 + eor w11, w11, w20, ror 16 + ubfx x20, x14, #32, #8 + bfi x10, x11, #32, #32 + ldr w12, [x8, x12, LSL 2] + ldr w13, [x8, x13, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x16, x15, #0, #8 + eor w12, w12, w13, ror 24 + ubfx x13, x14, #16, #8 + eor w12, w12, w19, ror 8 + ubfx x19, x15, #56, #8 + eor w11, w12, w20, ror 16 + ubfx x20, x14, #40, #8 + ldr w16, [x8, x16, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w13, [x8, x13, LSL 2] + ldr w20, [x8, x20, LSL 2] + eor w19, w19, w16, ror 24 + ldp x14, x15, [x26], #16 + eor w13, w13, w19, ror 24 + eor w13, w13, w20, ror 8 + bfi x11, x13, #32, #32 + # XOR in Key Schedule + eor x10, x10, x14 + eor x11, x11, x15 + subs w25, w25, #2 + bne L_AES_XTS_encrypt_loop_nr + ubfx x14, x10, #48, #8 + ubfx x17, x10, #24, #8 + ubfx x19, x11, #8, #8 + ubfx x20, x11, #32, #8 + ldr x12, [x8] + ldr x12, [x8, #64] + ldr x12, [x8, #128] + ldr x12, [x8, #192] + ldr x12, [x8, #256] + ldr x12, [x8, #320] + ldr x12, [x8, #384] + ldr x12, [x8, #448] + ldr x12, [x8, #512] + ldr x12, [x8, #576] + ldr x12, [x8, #640] + ldr x12, [x8, #704] + ldr x12, [x8, #768] + ldr x12, [x8, #832] + ldr x12, [x8, #896] + ldr
x12, [x8, #960] + ldr w14, [x8, x14, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x15, x11, #16, #8 + eor w14, w14, w17, ror 24 + ubfx x17, x10, #56, #8 + eor w14, w14, w19, ror 8 + ubfx x19, x11, #40, #8 + eor w14, w14, w20, ror 16 + ubfx x20, x10, #0, #8 + ldr w15, [x8, x15, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x16, x11, #48, #8 + eor w15, w15, w17, ror 24 + ubfx x17, x11, #24, #8 + eor w15, w15, w19, ror 8 + ubfx x19, x10, #8, #8 + eor w15, w15, w20, ror 16 + ubfx x20, x10, #32, #8 + bfi x14, x15, #32, #32 + ldr w16, [x8, x16, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x12, x11, #0, #8 + eor w16, w16, w17, ror 24 + ubfx x17, x10, #16, #8 + eor w16, w16, w19, ror 8 + ubfx x19, x11, #56, #8 + eor w15, w16, w20, ror 16 + ubfx x20, x10, #40, #8 + ldr w12, [x8, x12, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + eor w19, w19, w12, ror 24 + ldp x10, x11, [x26], #16 + eor w17, w17, w19, ror 24 + eor w17, w17, w20, ror 8 + bfi x15, x17, #32, #32 + # XOR in Key Schedule + eor x14, x14, x10 + eor x15, x15, x11 + /* Last round for the data block, using single-byte table loads. */ ubfx x10, x15, #32, #8 + ubfx x13, x15, #8, #8 + ubfx x19, x14, #48, #8 + ubfx x20, x14, #24, #8 + lsl w10, w10, #2 + lsl w13, w13, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldr x17, [x8] + ldr x17, [x8, #64] + ldr x17, [x8, #128] + ldr x17, [x8, #192] + ldr x17, [x8, #256] + ldr x17, [x8, #320] + ldr x17, [x8, #384] + ldr x17, [x8, #448] + ldr x17, [x8, #512] + ldr x17, [x8, #576] + ldr x17, [x8, #640] + ldr x17, [x8, #704] + ldr x17, [x8, #768] + ldr x17, [x8, #832] + ldr x17, [x8, #896] + ldr x17, [x8, #960] + ldrb w10, [x8, x10, LSL 0] + ldrb w13, [x8, x13, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + ubfx x11, x14, #0, #8 + eor w10, w10, w13, lsl 8 + ubfx x13, x15, #40, #8 + eor w10, w10, w19, lsl 16 + ubfx x19, x15, #16, #8
+ eor w10, w10, w20, lsl 24 + ubfx x20, x14, #56, #8 + lsl w11, w11, #2 + lsl w13, w13, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w11, [x8, x11, LSL 0] + ldrb w13, [x8, x13, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + ubfx x12, x14, #32, #8 + eor w11, w11, w13, lsl 8 + ubfx x13, x14, #8, #8 + eor w11, w11, w19, lsl 16 + ubfx x19, x15, #48, #8 + eor w11, w11, w20, lsl 24 + ubfx x20, x15, #24, #8 + bfi x10, x11, #32, #32 + lsl w12, w12, #2 + lsl w13, w13, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w12, [x8, x12, LSL 0] + ldrb w13, [x8, x13, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + ubfx x17, x15, #56, #8 + eor w12, w12, w13, lsl 8 + ubfx x13, x15, #0, #8 + eor w12, w12, w19, lsl 16 + ubfx x19, x14, #40, #8 + eor w11, w12, w20, lsl 24 + ubfx x20, x14, #16, #8 + lsl w17, w17, #2 + lsl w13, w13, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w17, [x8, x17, LSL 0] + ldrb w13, [x8, x13, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + eor w19, w19, w17, lsl 16 + ldp x14, x15, [x26] + eor w13, w13, w19, lsl 8 + eor w13, w13, w20, lsl 16 + bfi x11, x13, #32, #32 + # XOR in Key Schedule + eor x10, x10, x14 + eor x11, x11, x15 + rev32 x10, x10 + rev32 x11, x11 + /* XOR the tweak into the cipher output and store the block. */ eor x10, x10, x21 + eor x11, x11, x22 + stp x10, x11, [x1] + /* Next tweak: shift the 128-bit value x22:x21 left by one bit and XOR 0x87 into the low word when the top bit of x22 was set. */ and x19, x9, x22, asr 63 + extr x22, x22, x21, #63 + eor x21, x19, x21, lsl 1 + /* Advance past this block and keep looping while at least 16 bytes remain. */ sub w2, w2, #16 + add x0, x0, #16 + add x1, x1, #16 + cmp w2, #16 + bge L_AES_XTS_encrypt_loop_block + /* Nothing left over: finished. */ cbz w2, L_AES_XTS_encrypt_done_data + /* Trailing partial block: copy the block just written into the scratch buffer at x6, then for each of the w2 leftover bytes move the scratch byte to the output tail and replace it with the next input byte. */ mov x26, x4 + sub x1, x1, #16 + ldp x10, x11, [x1], #16 + stp x10, x11, [x6] + mov w14, w2 +L_AES_XTS_encrypt_start_byte: + ldrb w19, [x6] + ldrb w20, [x0], #1 + strb w19, [x1], #1 + strb w20, [x6], #1 + subs w14, w14, #1 + bgt L_AES_XTS_encrypt_start_byte + /* Rewind x6 to the start of the scratch buffer and x1 to the start of the previous output block. */ sub x1, x1, x2 + sub x6, x6, x2 + sub x1, x1, #16 + /* Encrypt the merged scratch block with the current tweak, same sequence as the per-block loop. */ ldp x10, x11, [x6] + ldp x14, x15, [x26], #16 + eor x10, x10, x21 + eor x11, x11, x22 + rev32 x10, x10 + rev32 x11, x11 + # Round: 0 - XOR in key schedule +
eor x10, x10, x14 + eor x11, x11, x15 + sub w25, w7, #2 +L_AES_XTS_encrypt_loop_nr_partial: + ubfx x14, x10, #48, #8 + ubfx x17, x10, #24, #8 + ubfx x19, x11, #8, #8 + ubfx x20, x11, #32, #8 + ldr x12, [x8] + ldr x12, [x8, #64] + ldr x12, [x8, #128] + ldr x12, [x8, #192] + ldr x12, [x8, #256] + ldr x12, [x8, #320] + ldr x12, [x8, #384] + ldr x12, [x8, #448] + ldr x12, [x8, #512] + ldr x12, [x8, #576] + ldr x12, [x8, #640] + ldr x12, [x8, #704] + ldr x12, [x8, #768] + ldr x12, [x8, #832] + ldr x12, [x8, #896] + ldr x12, [x8, #960] + ldr w14, [x8, x14, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x15, x11, #16, #8 + eor w14, w14, w17, ror 24 + ubfx x17, x10, #56, #8 + eor w14, w14, w19, ror 8 + ubfx x19, x11, #40, #8 + eor w14, w14, w20, ror 16 + ubfx x20, x10, #0, #8 + ldr w15, [x8, x15, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x16, x11, #48, #8 + eor w15, w15, w17, ror 24 + ubfx x17, x11, #24, #8 + eor w15, w15, w19, ror 8 + ubfx x19, x10, #8, #8 + eor w15, w15, w20, ror 16 + ubfx x20, x10, #32, #8 + bfi x14, x15, #32, #32 + ldr w16, [x8, x16, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x12, x11, #0, #8 + eor w16, w16, w17, ror 24 + ubfx x17, x10, #16, #8 + eor w16, w16, w19, ror 8 + ubfx x19, x11, #56, #8 + eor w15, w16, w20, ror 16 + ubfx x20, x10, #40, #8 + ldr w12, [x8, x12, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + eor w19, w19, w12, ror 24 + ldp x10, x11, [x26], #16 + eor w17, w17, w19, ror 24 + eor w17, w17, w20, ror 8 + bfi x15, x17, #32, #32 + # XOR in Key Schedule + eor x14, x14, x10 + eor x15, x15, x11 + ubfx x10, x14, #48, #8 + ubfx x13, x14, #24, #8 + ubfx x19, x15, #8, #8 + ubfx x20, x15, #32, #8 + ldr x16, [x8] + ldr x16, [x8, #64] + ldr x16, [x8, #128] + ldr x16, [x8, #192] + ldr x16, [x8, #256] + ldr x16, [x8, #320] + ldr x16, [x8, #384]
+ ldr x16, [x8, #448] + ldr x16, [x8, #512] + ldr x16, [x8, #576] + ldr x16, [x8, #640] + ldr x16, [x8, #704] + ldr x16, [x8, #768] + ldr x16, [x8, #832] + ldr x16, [x8, #896] + ldr x16, [x8, #960] + ldr w10, [x8, x10, LSL 2] + ldr w13, [x8, x13, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x11, x15, #16, #8 + eor w10, w10, w13, ror 24 + ubfx x13, x14, #56, #8 + eor w10, w10, w19, ror 8 + ubfx x19, x15, #40, #8 + eor w10, w10, w20, ror 16 + ubfx x20, x14, #0, #8 + ldr w11, [x8, x11, LSL 2] + ldr w13, [x8, x13, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x12, x15, #48, #8 + eor w11, w11, w13, ror 24 + ubfx x13, x15, #24, #8 + eor w11, w11, w19, ror 8 + ubfx x19, x14, #8, #8 + eor w11, w11, w20, ror 16 + ubfx x20, x14, #32, #8 + bfi x10, x11, #32, #32 + ldr w12, [x8, x12, LSL 2] + ldr w13, [x8, x13, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x16, x15, #0, #8 + eor w12, w12, w13, ror 24 + ubfx x13, x14, #16, #8 + eor w12, w12, w19, ror 8 + ubfx x19, x15, #56, #8 + eor w11, w12, w20, ror 16 + ubfx x20, x14, #40, #8 + ldr w16, [x8, x16, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w13, [x8, x13, LSL 2] + ldr w20, [x8, x20, LSL 2] + eor w19, w19, w16, ror 24 + ldp x14, x15, [x26], #16 + eor w13, w13, w19, ror 24 + eor w13, w13, w20, ror 8 + bfi x11, x13, #32, #32 + # XOR in Key Schedule + eor x10, x10, x14 + eor x11, x11, x15 + subs w25, w25, #2 + bne L_AES_XTS_encrypt_loop_nr_partial + ubfx x14, x10, #48, #8 + ubfx x17, x10, #24, #8 + ubfx x19, x11, #8, #8 + ubfx x20, x11, #32, #8 + ldr x12, [x8] + ldr x12, [x8, #64] + ldr x12, [x8, #128] + ldr x12, [x8, #192] + ldr x12, [x8, #256] + ldr x12, [x8, #320] + ldr x12, [x8, #384] + ldr x12, [x8, #448] + ldr x12, [x8, #512] + ldr x12, [x8, #576] + ldr x12, [x8, #640] + ldr x12, [x8, #704] + ldr x12, [x8, #768] + ldr x12, [x8, #832] + ldr x12, [x8, #896] + ldr x12, [x8, #960] + ldr w14, [x8, x14, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19,
LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x15, x11, #16, #8 + eor w14, w14, w17, ror 24 + ubfx x17, x10, #56, #8 + eor w14, w14, w19, ror 8 + ubfx x19, x11, #40, #8 + eor w14, w14, w20, ror 16 + ubfx x20, x10, #0, #8 + ldr w15, [x8, x15, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x16, x11, #48, #8 + eor w15, w15, w17, ror 24 + ubfx x17, x11, #24, #8 + eor w15, w15, w19, ror 8 + ubfx x19, x10, #8, #8 + eor w15, w15, w20, ror 16 + ubfx x20, x10, #32, #8 + bfi x14, x15, #32, #32 + ldr w16, [x8, x16, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ubfx x12, x11, #0, #8 + eor w16, w16, w17, ror 24 + ubfx x17, x10, #16, #8 + eor w16, w16, w19, ror 8 + ubfx x19, x11, #56, #8 + eor w15, w16, w20, ror 16 + ubfx x20, x10, #40, #8 + ldr w12, [x8, x12, LSL 2] + ldr w19, [x8, x19, LSL 2] + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + eor w19, w19, w12, ror 24 + ldp x10, x11, [x26], #16 + eor w17, w17, w19, ror 24 + eor w17, w17, w20, ror 8 + bfi x15, x17, #32, #32 + # XOR in Key Schedule + eor x14, x14, x10 + eor x15, x15, x11 + ubfx x10, x15, #32, #8 + ubfx x13, x15, #8, #8 + ubfx x19, x14, #48, #8 + ubfx x20, x14, #24, #8 + lsl w10, w10, #2 + lsl w13, w13, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldr x17, [x8] + ldr x17, [x8, #64] + ldr x17, [x8, #128] + ldr x17, [x8, #192] + ldr x17, [x8, #256] + ldr x17, [x8, #320] + ldr x17, [x8, #384] + ldr x17, [x8, #448] + ldr x17, [x8, #512] + ldr x17, [x8, #576] + ldr x17, [x8, #640] + ldr x17, [x8, #704] + ldr x17, [x8, #768] + ldr x17, [x8, #832] + ldr x17, [x8, #896] + ldr x17, [x8, #960] + ldrb w10, [x8, x10, LSL 0] + ldrb w13, [x8, x13, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + ubfx x11, x14, #0, #8 + eor w10, w10, w13, lsl 8 + ubfx x13, x15, #40, #8 + eor w10, w10, w19, lsl 16 + ubfx x19, x15, #16, #8 + eor w10, w10, w20, lsl 24 + ubfx x20, x14, #56, #8 + lsl w11, w11, #2 + lsl w13, w13, #2 +
lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w11, [x8, x11, LSL 0] + ldrb w13, [x8, x13, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + ubfx x12, x14, #32, #8 + eor w11, w11, w13, lsl 8 + ubfx x13, x14, #8, #8 + eor w11, w11, w19, lsl 16 + ubfx x19, x15, #48, #8 + eor w11, w11, w20, lsl 24 + ubfx x20, x15, #24, #8 + bfi x10, x11, #32, #32 + lsl w12, w12, #2 + lsl w13, w13, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w12, [x8, x12, LSL 0] + ldrb w13, [x8, x13, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + ubfx x17, x15, #56, #8 + eor w12, w12, w13, lsl 8 + ubfx x13, x15, #0, #8 + eor w12, w12, w19, lsl 16 + ubfx x19, x14, #40, #8 + eor w11, w12, w20, lsl 24 + ubfx x20, x14, #16, #8 + lsl w17, w17, #2 + lsl w13, w13, #2 + lsl w19, w19, #2 + lsl w20, w20, #2 + ldrb w17, [x8, x17, LSL 0] + ldrb w13, [x8, x13, LSL 0] + ldrb w19, [x8, x19, LSL 0] + ldrb w20, [x8, x20, LSL 0] + eor w19, w19, w17, lsl 16 + ldp x14, x15, [x26] + eor w13, w13, w19, lsl 8 + eor w13, w13, w20, lsl 16 + bfi x11, x13, #32, #32 + # XOR in Key Schedule + eor x10, x10, x14 + eor x11, x11, x15 + rev32 x10, x10 + rev32 x11, x11 + /* XOR with the tweak and store over the previous output block. */ eor x10, x10, x21 + eor x11, x11, x22 + stp x10, x11, [x1] +L_AES_XTS_encrypt_done_data: + /* Epilogue: restore the saved registers and release the 96-byte stack frame. */ ldp x17, x19, [x29, #24] + ldp x20, x21, [x29, #40] + ldp x22, x23, [x29, #56] + ldp x24, x25, [x29, #72] + ldr x26, [x29, #88] + ldp x29, x30, [sp], #0x60 + ret +#ifndef __APPLE__ + .size AES_XTS_encrypt,.-AES_XTS_encrypt +#endif /* __APPLE__ */ +#ifdef HAVE_AES_DECRYPT +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt +.type AES_XTS_decrypt,@function +.align 2 +AES_XTS_decrypt: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt +.p2align 2 +_AES_XTS_decrypt: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-112]!
+ add x29, sp, #0 + stp x17, x19, [x29, #24] + stp x20, x21, [x29, #40] + stp x22, x23, [x29, #56] + stp x24, x25, [x29, #72] + stp x26, x27, [x29, #88] + str x28, [x29, #104] +#ifndef __APPLE__ + adrp x8, L_AES_ARM64_td + add x8, x8, :lo12:L_AES_ARM64_td +#else + adrp x8, L_AES_ARM64_td@PAGE + add x8, x8, :lo12:L_AES_ARM64_td@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x9, L_AES_ARM64_td4 + add x9, x9, :lo12:L_AES_ARM64_td4 +#else + adrp x9, L_AES_ARM64_td4@PAGE + add x9, x9, :lo12:L_AES_ARM64_td4@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x10, L_AES_ARM64_te + add x10, x10, :lo12:L_AES_ARM64_te +#else + adrp x10, L_AES_ARM64_te@PAGE + add x10, x10, :lo12:L_AES_ARM64_te@PAGEOFF +#endif /* __APPLE__ */ + ands w11, w2, #15 + cset w11, ne + lsl w11, w11, #4 + sub w2, w2, w11 + mov x11, #0x87 + mov x28, x5 + ldp x23, x24, [x3] + ldp x16, x17, [x28], #16 + rev32 x23, x23 + rev32 x24, x24 + # Round: 0 - XOR in key schedule + eor x23, x23, x16 + eor x24, x24, x17 + sub w27, w7, #2 +L_AES_XTS_decrypt_loop_nr_tweak: + ubfx x16, x23, #48, #8 + ubfx x20, x23, #24, #8 + ubfx x21, x24, #8, #8 + ubfx x22, x24, #32, #8 + ldr x25, [x10] + ldr x25, [x10, #64] + ldr x25, [x10, #128] + ldr x25, [x10, #192] + ldr x25, [x10, #256] + ldr x25, [x10, #320] + ldr x25, [x10, #384] + ldr x25, [x10, #448] + ldr x25, [x10, #512] + ldr x25, [x10, #576] + ldr x25, [x10, #640] + ldr x25, [x10, #704] + ldr x25, [x10, #768] + ldr x25, [x10, #832] + ldr x25, [x10, #896] + ldr x25, [x10, #960] + ldr w16, [x10, x16, LSL 2] + ldr w20, [x10, x20, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w22, [x10, x22, LSL 2] + ubfx x17, x24, #16, #8 + eor w16, w16, w20, ror 24 + ubfx x20, x23, #56, #8 + eor w16, w16, w21, ror 8 + ubfx x21, x24, #40, #8 + eor w16, w16, w22, ror 16 + ubfx x22, x23, #0, #8 + ldr w17, [x10, x17, LSL 2] + ldr w20, [x10, x20, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w22, [x10, x22, LSL 2] + ubfx x19, x24, #48, #8 + eor w17, w17, w20, ror 24 + ubfx x20, x24, 
#24, #8 + eor w17, w17, w21, ror 8 + ubfx x21, x23, #8, #8 + eor w17, w17, w22, ror 16 + ubfx x22, x23, #32, #8 + bfi x16, x17, #32, #32 + ldr w19, [x10, x19, LSL 2] + ldr w20, [x10, x20, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w22, [x10, x22, LSL 2] + ubfx x25, x24, #0, #8 + eor w19, w19, w20, ror 24 + ubfx x20, x23, #16, #8 + eor w19, w19, w21, ror 8 + ubfx x21, x24, #56, #8 + eor w17, w19, w22, ror 16 + ubfx x22, x23, #40, #8 + ldr w25, [x10, x25, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w20, [x10, x20, LSL 2] + ldr w22, [x10, x22, LSL 2] + eor w21, w21, w25, ror 24 + ldp x23, x24, [x28], #16 + eor w20, w20, w21, ror 24 + eor w20, w20, w22, ror 8 + bfi x17, x20, #32, #32 + # XOR in Key Schedule + eor x16, x16, x23 + eor x17, x17, x24 + ubfx x23, x16, #48, #8 + ubfx x26, x16, #24, #8 + ubfx x21, x17, #8, #8 + ubfx x22, x17, #32, #8 + ldr x19, [x10] + ldr x19, [x10, #64] + ldr x19, [x10, #128] + ldr x19, [x10, #192] + ldr x19, [x10, #256] + ldr x19, [x10, #320] + ldr x19, [x10, #384] + ldr x19, [x10, #448] + ldr x19, [x10, #512] + ldr x19, [x10, #576] + ldr x19, [x10, #640] + ldr x19, [x10, #704] + ldr x19, [x10, #768] + ldr x19, [x10, #832] + ldr x19, [x10, #896] + ldr x19, [x10, #960] + ldr w23, [x10, x23, LSL 2] + ldr w26, [x10, x26, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w22, [x10, x22, LSL 2] + ubfx x24, x17, #16, #8 + eor w23, w23, w26, ror 24 + ubfx x26, x16, #56, #8 + eor w23, w23, w21, ror 8 + ubfx x21, x17, #40, #8 + eor w23, w23, w22, ror 16 + ubfx x22, x16, #0, #8 + ldr w24, [x10, x24, LSL 2] + ldr w26, [x10, x26, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w22, [x10, x22, LSL 2] + ubfx x25, x17, #48, #8 + eor w24, w24, w26, ror 24 + ubfx x26, x17, #24, #8 + eor w24, w24, w21, ror 8 + ubfx x21, x16, #8, #8 + eor w24, w24, w22, ror 16 + ubfx x22, x16, #32, #8 + bfi x23, x24, #32, #32 + ldr w25, [x10, x25, LSL 2] + ldr w26, [x10, x26, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w22, [x10, x22, LSL 2] + ubfx x19, x17, #0, #8 + eor w25, w25, w26, ror 24 
+ ubfx x26, x16, #16, #8 + eor w25, w25, w21, ror 8 + ubfx x21, x17, #56, #8 + eor w24, w25, w22, ror 16 + ubfx x22, x16, #40, #8 + ldr w19, [x10, x19, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w26, [x10, x26, LSL 2] + ldr w22, [x10, x22, LSL 2] + eor w21, w21, w19, ror 24 + ldp x16, x17, [x28], #16 + eor w26, w26, w21, ror 24 + eor w26, w26, w22, ror 8 + bfi x24, x26, #32, #32 + # XOR in Key Schedule + eor x23, x23, x16 + eor x24, x24, x17 + subs w27, w27, #2 + bne L_AES_XTS_decrypt_loop_nr_tweak + ubfx x16, x23, #48, #8 + ubfx x20, x23, #24, #8 + ubfx x21, x24, #8, #8 + ubfx x22, x24, #32, #8 + ldr x25, [x10] + ldr x25, [x10, #64] + ldr x25, [x10, #128] + ldr x25, [x10, #192] + ldr x25, [x10, #256] + ldr x25, [x10, #320] + ldr x25, [x10, #384] + ldr x25, [x10, #448] + ldr x25, [x10, #512] + ldr x25, [x10, #576] + ldr x25, [x10, #640] + ldr x25, [x10, #704] + ldr x25, [x10, #768] + ldr x25, [x10, #832] + ldr x25, [x10, #896] + ldr x25, [x10, #960] + ldr w16, [x10, x16, LSL 2] + ldr w20, [x10, x20, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w22, [x10, x22, LSL 2] + ubfx x17, x24, #16, #8 + eor w16, w16, w20, ror 24 + ubfx x20, x23, #56, #8 + eor w16, w16, w21, ror 8 + ubfx x21, x24, #40, #8 + eor w16, w16, w22, ror 16 + ubfx x22, x23, #0, #8 + ldr w17, [x10, x17, LSL 2] + ldr w20, [x10, x20, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w22, [x10, x22, LSL 2] + ubfx x19, x24, #48, #8 + eor w17, w17, w20, ror 24 + ubfx x20, x24, #24, #8 + eor w17, w17, w21, ror 8 + ubfx x21, x23, #8, #8 + eor w17, w17, w22, ror 16 + ubfx x22, x23, #32, #8 + bfi x16, x17, #32, #32 + ldr w19, [x10, x19, LSL 2] + ldr w20, [x10, x20, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w22, [x10, x22, LSL 2] + ubfx x25, x24, #0, #8 + eor w19, w19, w20, ror 24 + ubfx x20, x23, #16, #8 + eor w19, w19, w21, ror 8 + ubfx x21, x24, #56, #8 + eor w17, w19, w22, ror 16 + ubfx x22, x23, #40, #8 + ldr w25, [x10, x25, LSL 2] + ldr w21, [x10, x21, LSL 2] + ldr w20, [x10, x20, LSL 2] + ldr w22, [x10, x22, LSL 2] + 
eor w21, w21, w25, ror 24 + ldp x23, x24, [x28], #16 + eor w20, w20, w21, ror 24 + eor w20, w20, w22, ror 8 + bfi x17, x20, #32, #32 + # XOR in Key Schedule + eor x16, x16, x23 + eor x17, x17, x24 + ubfx x23, x17, #32, #8 + ubfx x26, x17, #8, #8 + ubfx x21, x16, #48, #8 + ubfx x22, x16, #24, #8 + lsl w23, w23, #2 + lsl w26, w26, #2 + lsl w21, w21, #2 + lsl w22, w22, #2 + ldr x20, [x10] + ldr x20, [x10, #64] + ldr x20, [x10, #128] + ldr x20, [x10, #192] + ldr x20, [x10, #256] + ldr x20, [x10, #320] + ldr x20, [x10, #384] + ldr x20, [x10, #448] + ldr x20, [x10, #512] + ldr x20, [x10, #576] + ldr x20, [x10, #640] + ldr x20, [x10, #704] + ldr x20, [x10, #768] + ldr x20, [x10, #832] + ldr x20, [x10, #896] + ldr x20, [x10, #960] + ldrb w23, [x10, x23, LSL 0] + ldrb w26, [x10, x26, LSL 0] + ldrb w21, [x10, x21, LSL 0] + ldrb w22, [x10, x22, LSL 0] + ubfx x24, x16, #0, #8 + eor w23, w23, w26, lsl 8 + ubfx x26, x17, #40, #8 + eor w23, w23, w21, lsl 16 + ubfx x21, x17, #16, #8 + eor w23, w23, w22, lsl 24 + ubfx x22, x16, #56, #8 + lsl w24, w24, #2 + lsl w26, w26, #2 + lsl w21, w21, #2 + lsl w22, w22, #2 + ldrb w24, [x10, x24, LSL 0] + ldrb w26, [x10, x26, LSL 0] + ldrb w21, [x10, x21, LSL 0] + ldrb w22, [x10, x22, LSL 0] + ubfx x25, x16, #32, #8 + eor w24, w24, w26, lsl 8 + ubfx x26, x16, #8, #8 + eor w24, w24, w21, lsl 16 + ubfx x21, x17, #48, #8 + eor w24, w24, w22, lsl 24 + ubfx x22, x17, #24, #8 + bfi x23, x24, #32, #32 + lsl w25, w25, #2 + lsl w26, w26, #2 + lsl w21, w21, #2 + lsl w22, w22, #2 + ldrb w25, [x10, x25, LSL 0] + ldrb w26, [x10, x26, LSL 0] + ldrb w21, [x10, x21, LSL 0] + ldrb w22, [x10, x22, LSL 0] + ubfx x20, x17, #56, #8 + eor w25, w25, w26, lsl 8 + ubfx x26, x17, #0, #8 + eor w25, w25, w21, lsl 16 + ubfx x21, x16, #40, #8 + eor w24, w25, w22, lsl 24 + ubfx x22, x16, #16, #8 + lsl w20, w20, #2 + lsl w26, w26, #2 + lsl w21, w21, #2 + lsl w22, w22, #2 + ldrb w20, [x10, x20, LSL 0] + ldrb w26, [x10, x26, LSL 0] + ldrb w21, [x10, x21, LSL 0] + ldrb w22, [x10, 
x22, LSL 0] + eor w21, w21, w20, lsl 16 + ldp x16, x17, [x28] + eor w26, w26, w21, lsl 8 + eor w26, w26, w22, lsl 16 + bfi x24, x26, #32, #32 + # XOR in Key Schedule + eor x23, x23, x16 + eor x24, x24, x17 + rev32 x23, x23 + rev32 x24, x24 + cmp w2, #16 + blt L_AES_XTS_decrypt_start_partail +L_AES_XTS_decrypt_loop_block: + mov x28, x4 + ldp x12, x13, [x0] + ldp x16, x17, [x28], #16 + eor x12, x12, x23 + eor x13, x13, x24 + rev32 x12, x12 + rev32 x13, x13 + # Round: 0 - XOR in key schedule + eor x12, x12, x16 + eor x13, x13, x17 + sub w27, w7, #2 +L_AES_XTS_decrypt_loop_nr: + ubfx x16, x13, #48, #8 + ubfx x20, x12, #24, #8 + ubfx x21, x13, #8, #8 + ubfx x22, x12, #32, #8 + ldr x14, [x8] + ldr x14, [x8, #64] + ldr x14, [x8, #128] + ldr x14, [x8, #192] + ldr x14, [x8, #256] + ldr x14, [x8, #320] + ldr x14, [x8, #384] + ldr x14, [x8, #448] + ldr x14, [x8, #512] + ldr x14, [x8, #576] + ldr x14, [x8, #640] + ldr x14, [x8, #704] + ldr x14, [x8, #768] + ldr x14, [x8, #832] + ldr x14, [x8, #896] + ldr x14, [x8, #960] + ldr w16, [x8, x16, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x17, x12, #16, #8 + eor w16, w16, w20, ror 24 + ubfx x20, x12, #56, #8 + eor w16, w16, w21, ror 8 + ubfx x21, x13, #40, #8 + eor w16, w16, w22, ror 16 + ubfx x22, x13, #0, #8 + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x19, x12, #48, #8 + eor w17, w17, w20, ror 24 + ubfx x20, x13, #24, #8 + eor w17, w17, w21, ror 8 + ubfx x21, x12, #8, #8 + eor w17, w17, w22, ror 16 + ubfx x22, x13, #32, #8 + bfi x16, x17, #32, #32 + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x14, x12, #0, #8 + eor w19, w19, w20, ror 24 + ubfx x20, x13, #16, #8 + eor w19, w19, w21, ror 8 + ubfx x21, x13, #56, #8 + eor w17, w19, w22, ror 16 + ubfx x22, x12, #40, #8 + ldr w14, [x8, x14, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w20, 
[x8, x20, LSL 2] + ldr w22, [x8, x22, LSL 2] + eor w21, w21, w14, ror 24 + ldp x12, x13, [x28], #16 + eor w20, w20, w22, ror 8 + eor w20, w20, w21, ror 24 + bfi x17, x20, #32, #32 + # XOR in Key Schedule + eor x16, x16, x12 + eor x17, x17, x13 + ubfx x12, x17, #48, #8 + ubfx x15, x16, #24, #8 + ubfx x21, x17, #8, #8 + ubfx x22, x16, #32, #8 + ldr x19, [x8] + ldr x19, [x8, #64] + ldr x19, [x8, #128] + ldr x19, [x8, #192] + ldr x19, [x8, #256] + ldr x19, [x8, #320] + ldr x19, [x8, #384] + ldr x19, [x8, #448] + ldr x19, [x8, #512] + ldr x19, [x8, #576] + ldr x19, [x8, #640] + ldr x19, [x8, #704] + ldr x19, [x8, #768] + ldr x19, [x8, #832] + ldr x19, [x8, #896] + ldr x19, [x8, #960] + ldr w12, [x8, x12, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x13, x16, #16, #8 + eor w12, w12, w15, ror 24 + ubfx x15, x16, #56, #8 + eor w12, w12, w21, ror 8 + ubfx x21, x17, #40, #8 + eor w12, w12, w22, ror 16 + ubfx x22, x17, #0, #8 + ldr w13, [x8, x13, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x14, x16, #48, #8 + eor w13, w13, w15, ror 24 + ubfx x15, x17, #24, #8 + eor w13, w13, w21, ror 8 + ubfx x21, x16, #8, #8 + eor w13, w13, w22, ror 16 + ubfx x22, x17, #32, #8 + bfi x12, x13, #32, #32 + ldr w14, [x8, x14, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x19, x16, #0, #8 + eor w14, w14, w15, ror 24 + ubfx x15, x17, #16, #8 + eor w14, w14, w21, ror 8 + ubfx x21, x17, #56, #8 + eor w13, w14, w22, ror 16 + ubfx x22, x16, #40, #8 + ldr w19, [x8, x19, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w22, [x8, x22, LSL 2] + eor w21, w21, w19, ror 24 + ldp x16, x17, [x28], #16 + eor w15, w15, w22, ror 8 + eor w15, w15, w21, ror 24 + bfi x13, x15, #32, #32 + # XOR in Key Schedule + eor x12, x12, x16 + eor x13, x13, x17 + subs w27, w27, #2 + bne L_AES_XTS_decrypt_loop_nr + ubfx x16, x13, #48, #8 + ubfx x20, x12, 
#24, #8 + ubfx x21, x13, #8, #8 + ubfx x22, x12, #32, #8 + ldr x14, [x8] + ldr x14, [x8, #64] + ldr x14, [x8, #128] + ldr x14, [x8, #192] + ldr x14, [x8, #256] + ldr x14, [x8, #320] + ldr x14, [x8, #384] + ldr x14, [x8, #448] + ldr x14, [x8, #512] + ldr x14, [x8, #576] + ldr x14, [x8, #640] + ldr x14, [x8, #704] + ldr x14, [x8, #768] + ldr x14, [x8, #832] + ldr x14, [x8, #896] + ldr x14, [x8, #960] + ldr w16, [x8, x16, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x17, x12, #16, #8 + eor w16, w16, w20, ror 24 + ubfx x20, x12, #56, #8 + eor w16, w16, w21, ror 8 + ubfx x21, x13, #40, #8 + eor w16, w16, w22, ror 16 + ubfx x22, x13, #0, #8 + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x19, x12, #48, #8 + eor w17, w17, w20, ror 24 + ubfx x20, x13, #24, #8 + eor w17, w17, w21, ror 8 + ubfx x21, x12, #8, #8 + eor w17, w17, w22, ror 16 + ubfx x22, x13, #32, #8 + bfi x16, x17, #32, #32 + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x14, x12, #0, #8 + eor w19, w19, w20, ror 24 + ubfx x20, x13, #16, #8 + eor w19, w19, w21, ror 8 + ubfx x21, x13, #56, #8 + eor w17, w19, w22, ror 16 + ubfx x22, x12, #40, #8 + ldr w14, [x8, x14, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w22, [x8, x22, LSL 2] + eor w21, w21, w14, ror 24 + ldp x12, x13, [x28], #16 + eor w20, w20, w22, ror 8 + eor w20, w20, w21, ror 24 + bfi x17, x20, #32, #32 + # XOR in Key Schedule + eor x16, x16, x12 + eor x17, x17, x13 + ubfx x12, x16, #32, #8 + ubfx x15, x17, #8, #8 + ubfx x21, x17, #48, #8 + ubfx x22, x16, #24, #8 + ldr x20, [x9] + ldr x20, [x9, #64] + ldr x20, [x9, #128] + ldr x20, [x9, #192] + ldr x20, [x9, #256] + ldr x20, [x9, #320] + ldr x20, [x9, #384] + ldr x20, [x9, #448] + ldr x20, [x9, #512] + ldr x20, [x9, #576] + ldr x20, [x9, #640] + ldr x20, [x9, #704] + ldr x20, [x9, #768] + 
ldr x20, [x9, #832] + ldr x20, [x9, #896] + ldr x20, [x9, #960] + ldrb w12, [x9, x12, LSL 0] + ldrb w15, [x9, x15, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ldrb w22, [x9, x22, LSL 0] + ubfx x13, x17, #0, #8 + eor w12, w12, w15, lsl 8 + ubfx x15, x17, #40, #8 + eor w12, w12, w21, lsl 16 + ubfx x21, x16, #16, #8 + eor w12, w12, w22, lsl 24 + ubfx x22, x16, #56, #8 + ldrb w15, [x9, x15, LSL 0] + ldrb w22, [x9, x22, LSL 0] + ldrb w13, [x9, x13, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ubfx x14, x17, #32, #8 + eor w13, w13, w15, lsl 8 + ubfx x15, x16, #8, #8 + eor w13, w13, w21, lsl 16 + ubfx x21, x16, #48, #8 + eor w13, w13, w22, lsl 24 + ubfx x22, x17, #24, #8 + bfi x12, x13, #32, #32 + ldrb w15, [x9, x15, LSL 0] + ldrb w22, [x9, x22, LSL 0] + ldrb w14, [x9, x14, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ubfx x20, x17, #56, #8 + eor w14, w14, w15, lsl 8 + ubfx x15, x16, #0, #8 + eor w14, w14, w21, lsl 16 + ubfx x21, x16, #40, #8 + eor w13, w14, w22, lsl 24 + ubfx x22, x17, #16, #8 + ldrb w20, [x9, x20, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ldrb w15, [x9, x15, LSL 0] + ldrb w22, [x9, x22, LSL 0] + eor w21, w21, w20, lsl 16 + ldp x16, x17, [x28] + eor w15, w15, w21, lsl 8 + eor w15, w15, w22, lsl 16 + bfi x13, x15, #32, #32 + # XOR in Key Schedule + eor x12, x12, x16 + eor x13, x13, x17 + rev32 x12, x12 + rev32 x13, x13 + eor x12, x12, x23 + eor x13, x13, x24 + stp x12, x13, [x1] + and x21, x11, x24, asr 63 + extr x24, x24, x23, #63 + eor x23, x21, x23, lsl 1 + sub w2, w2, #16 + add x0, x0, #16 + add x1, x1, #16 + cmp w2, #16 + bge L_AES_XTS_decrypt_loop_block + cbz w2, L_AES_XTS_decrypt_done_data +L_AES_XTS_decrypt_start_partail: + and x21, x11, x24, asr 63 + extr x26, x24, x23, #63 + eor x25, x21, x23, lsl 1 + mov x28, x4 + ldp x12, x13, [x0], #16 + ldp x16, x17, [x28], #16 + eor x12, x12, x25 + eor x13, x13, x26 + rev32 x12, x12 + rev32 x13, x13 + # Round: 0 - XOR in key schedule + eor x12, x12, x16 + eor x13, x13, x17 + sub w27, w7, #2 +L_AES_XTS_decrypt_loop_nr_partial_1: + 
ubfx x16, x13, #48, #8 + ubfx x20, x12, #24, #8 + ubfx x21, x13, #8, #8 + ubfx x22, x12, #32, #8 + ldr x14, [x8] + ldr x14, [x8, #64] + ldr x14, [x8, #128] + ldr x14, [x8, #192] + ldr x14, [x8, #256] + ldr x14, [x8, #320] + ldr x14, [x8, #384] + ldr x14, [x8, #448] + ldr x14, [x8, #512] + ldr x14, [x8, #576] + ldr x14, [x8, #640] + ldr x14, [x8, #704] + ldr x14, [x8, #768] + ldr x14, [x8, #832] + ldr x14, [x8, #896] + ldr x14, [x8, #960] + ldr w16, [x8, x16, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x17, x12, #16, #8 + eor w16, w16, w20, ror 24 + ubfx x20, x12, #56, #8 + eor w16, w16, w21, ror 8 + ubfx x21, x13, #40, #8 + eor w16, w16, w22, ror 16 + ubfx x22, x13, #0, #8 + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x19, x12, #48, #8 + eor w17, w17, w20, ror 24 + ubfx x20, x13, #24, #8 + eor w17, w17, w21, ror 8 + ubfx x21, x12, #8, #8 + eor w17, w17, w22, ror 16 + ubfx x22, x13, #32, #8 + bfi x16, x17, #32, #32 + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x14, x12, #0, #8 + eor w19, w19, w20, ror 24 + ubfx x20, x13, #16, #8 + eor w19, w19, w21, ror 8 + ubfx x21, x13, #56, #8 + eor w17, w19, w22, ror 16 + ubfx x22, x12, #40, #8 + ldr w14, [x8, x14, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w22, [x8, x22, LSL 2] + eor w21, w21, w14, ror 24 + ldp x12, x13, [x28], #16 + eor w20, w20, w22, ror 8 + eor w20, w20, w21, ror 24 + bfi x17, x20, #32, #32 + # XOR in Key Schedule + eor x16, x16, x12 + eor x17, x17, x13 + ubfx x12, x17, #48, #8 + ubfx x15, x16, #24, #8 + ubfx x21, x17, #8, #8 + ubfx x22, x16, #32, #8 + ldr x19, [x8] + ldr x19, [x8, #64] + ldr x19, [x8, #128] + ldr x19, [x8, #192] + ldr x19, [x8, #256] + ldr x19, [x8, #320] + ldr x19, [x8, #384] + ldr x19, [x8, #448] + ldr x19, [x8, #512] + ldr x19, [x8, #576] + ldr x19, [x8, #640] + ldr 
x19, [x8, #704] + ldr x19, [x8, #768] + ldr x19, [x8, #832] + ldr x19, [x8, #896] + ldr x19, [x8, #960] + ldr w12, [x8, x12, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x13, x16, #16, #8 + eor w12, w12, w15, ror 24 + ubfx x15, x16, #56, #8 + eor w12, w12, w21, ror 8 + ubfx x21, x17, #40, #8 + eor w12, w12, w22, ror 16 + ubfx x22, x17, #0, #8 + ldr w13, [x8, x13, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x14, x16, #48, #8 + eor w13, w13, w15, ror 24 + ubfx x15, x17, #24, #8 + eor w13, w13, w21, ror 8 + ubfx x21, x16, #8, #8 + eor w13, w13, w22, ror 16 + ubfx x22, x17, #32, #8 + bfi x12, x13, #32, #32 + ldr w14, [x8, x14, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x19, x16, #0, #8 + eor w14, w14, w15, ror 24 + ubfx x15, x17, #16, #8 + eor w14, w14, w21, ror 8 + ubfx x21, x17, #56, #8 + eor w13, w14, w22, ror 16 + ubfx x22, x16, #40, #8 + ldr w19, [x8, x19, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w22, [x8, x22, LSL 2] + eor w21, w21, w19, ror 24 + ldp x16, x17, [x28], #16 + eor w15, w15, w22, ror 8 + eor w15, w15, w21, ror 24 + bfi x13, x15, #32, #32 + # XOR in Key Schedule + eor x12, x12, x16 + eor x13, x13, x17 + subs w27, w27, #2 + bne L_AES_XTS_decrypt_loop_nr_partial_1 + ubfx x16, x13, #48, #8 + ubfx x20, x12, #24, #8 + ubfx x21, x13, #8, #8 + ubfx x22, x12, #32, #8 + ldr x14, [x8] + ldr x14, [x8, #64] + ldr x14, [x8, #128] + ldr x14, [x8, #192] + ldr x14, [x8, #256] + ldr x14, [x8, #320] + ldr x14, [x8, #384] + ldr x14, [x8, #448] + ldr x14, [x8, #512] + ldr x14, [x8, #576] + ldr x14, [x8, #640] + ldr x14, [x8, #704] + ldr x14, [x8, #768] + ldr x14, [x8, #832] + ldr x14, [x8, #896] + ldr x14, [x8, #960] + ldr w16, [x8, x16, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x17, x12, #16, #8 + eor w16, w16, w20, ror 24 + ubfx 
x20, x12, #56, #8 + eor w16, w16, w21, ror 8 + ubfx x21, x13, #40, #8 + eor w16, w16, w22, ror 16 + ubfx x22, x13, #0, #8 + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x19, x12, #48, #8 + eor w17, w17, w20, ror 24 + ubfx x20, x13, #24, #8 + eor w17, w17, w21, ror 8 + ubfx x21, x12, #8, #8 + eor w17, w17, w22, ror 16 + ubfx x22, x13, #32, #8 + bfi x16, x17, #32, #32 + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x14, x12, #0, #8 + eor w19, w19, w20, ror 24 + ubfx x20, x13, #16, #8 + eor w19, w19, w21, ror 8 + ubfx x21, x13, #56, #8 + eor w17, w19, w22, ror 16 + ubfx x22, x12, #40, #8 + ldr w14, [x8, x14, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w22, [x8, x22, LSL 2] + eor w21, w21, w14, ror 24 + ldp x12, x13, [x28], #16 + eor w20, w20, w22, ror 8 + eor w20, w20, w21, ror 24 + bfi x17, x20, #32, #32 + # XOR in Key Schedule + eor x16, x16, x12 + eor x17, x17, x13 + ubfx x12, x16, #32, #8 + ubfx x15, x17, #8, #8 + ubfx x21, x17, #48, #8 + ubfx x22, x16, #24, #8 + ldr x20, [x9] + ldr x20, [x9, #64] + ldr x20, [x9, #128] + ldr x20, [x9, #192] + ldr x20, [x9, #256] + ldr x20, [x9, #320] + ldr x20, [x9, #384] + ldr x20, [x9, #448] + ldr x20, [x9, #512] + ldr x20, [x9, #576] + ldr x20, [x9, #640] + ldr x20, [x9, #704] + ldr x20, [x9, #768] + ldr x20, [x9, #832] + ldr x20, [x9, #896] + ldr x20, [x9, #960] + ldrb w12, [x9, x12, LSL 0] + ldrb w15, [x9, x15, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ldrb w22, [x9, x22, LSL 0] + ubfx x13, x17, #0, #8 + eor w12, w12, w15, lsl 8 + ubfx x15, x17, #40, #8 + eor w12, w12, w21, lsl 16 + ubfx x21, x16, #16, #8 + eor w12, w12, w22, lsl 24 + ubfx x22, x16, #56, #8 + ldrb w15, [x9, x15, LSL 0] + ldrb w22, [x9, x22, LSL 0] + ldrb w13, [x9, x13, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ubfx x14, x17, #32, #8 + eor w13, w13, w15, lsl 8 + ubfx x15, x16, #8, #8 + eor w13, w13, 
w21, lsl 16 + ubfx x21, x16, #48, #8 + eor w13, w13, w22, lsl 24 + ubfx x22, x17, #24, #8 + bfi x12, x13, #32, #32 + ldrb w15, [x9, x15, LSL 0] + ldrb w22, [x9, x22, LSL 0] + ldrb w14, [x9, x14, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ubfx x20, x17, #56, #8 + eor w14, w14, w15, lsl 8 + ubfx x15, x16, #0, #8 + eor w14, w14, w21, lsl 16 + ubfx x21, x16, #40, #8 + eor w13, w14, w22, lsl 24 + ubfx x22, x17, #16, #8 + ldrb w20, [x9, x20, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ldrb w15, [x9, x15, LSL 0] + ldrb w22, [x9, x22, LSL 0] + eor w21, w21, w20, lsl 16 + ldp x16, x17, [x28] + eor w15, w15, w21, lsl 8 + eor w15, w15, w22, lsl 16 + bfi x13, x15, #32, #32 + # XOR in Key Schedule + eor x12, x12, x16 + eor x13, x13, x17 + rev32 x12, x12 + rev32 x13, x13 + eor x12, x12, x25 + eor x13, x13, x26 + stp x12, x13, [x6] + add x1, x1, #16 + mov w16, w2 +L_AES_XTS_decrypt_start_byte: + ldrb w21, [x6] + ldrb w22, [x0], #1 + strb w21, [x1], #1 + strb w22, [x6], #1 + subs w16, w16, #1 + bgt L_AES_XTS_decrypt_start_byte + sub x1, x1, x2 + sub x6, x6, x2 + sub x1, x1, #16 + mov x28, x4 + ldp x12, x13, [x6] + ldp x16, x17, [x28], #16 + eor x12, x12, x23 + eor x13, x13, x24 + rev32 x12, x12 + rev32 x13, x13 + # Round: 0 - XOR in key schedule + eor x12, x12, x16 + eor x13, x13, x17 + sub w27, w7, #2 +L_AES_XTS_decrypt_loop_nr_partial_2: + ubfx x16, x13, #48, #8 + ubfx x20, x12, #24, #8 + ubfx x21, x13, #8, #8 + ubfx x22, x12, #32, #8 + ldr x14, [x8] + ldr x14, [x8, #64] + ldr x14, [x8, #128] + ldr x14, [x8, #192] + ldr x14, [x8, #256] + ldr x14, [x8, #320] + ldr x14, [x8, #384] + ldr x14, [x8, #448] + ldr x14, [x8, #512] + ldr x14, [x8, #576] + ldr x14, [x8, #640] + ldr x14, [x8, #704] + ldr x14, [x8, #768] + ldr x14, [x8, #832] + ldr x14, [x8, #896] + ldr x14, [x8, #960] + ldr w16, [x8, x16, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x17, x12, #16, #8 + eor w16, w16, w20, ror 24 + ubfx x20, x12, #56, #8 + eor w16, w16, w21, ror 8 + 
ubfx x21, x13, #40, #8 + eor w16, w16, w22, ror 16 + ubfx x22, x13, #0, #8 + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x19, x12, #48, #8 + eor w17, w17, w20, ror 24 + ubfx x20, x13, #24, #8 + eor w17, w17, w21, ror 8 + ubfx x21, x12, #8, #8 + eor w17, w17, w22, ror 16 + ubfx x22, x13, #32, #8 + bfi x16, x17, #32, #32 + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x14, x12, #0, #8 + eor w19, w19, w20, ror 24 + ubfx x20, x13, #16, #8 + eor w19, w19, w21, ror 8 + ubfx x21, x13, #56, #8 + eor w17, w19, w22, ror 16 + ubfx x22, x12, #40, #8 + ldr w14, [x8, x14, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w22, [x8, x22, LSL 2] + eor w21, w21, w14, ror 24 + ldp x12, x13, [x28], #16 + eor w20, w20, w22, ror 8 + eor w20, w20, w21, ror 24 + bfi x17, x20, #32, #32 + # XOR in Key Schedule + eor x16, x16, x12 + eor x17, x17, x13 + ubfx x12, x17, #48, #8 + ubfx x15, x16, #24, #8 + ubfx x21, x17, #8, #8 + ubfx x22, x16, #32, #8 + ldr x19, [x8] + ldr x19, [x8, #64] + ldr x19, [x8, #128] + ldr x19, [x8, #192] + ldr x19, [x8, #256] + ldr x19, [x8, #320] + ldr x19, [x8, #384] + ldr x19, [x8, #448] + ldr x19, [x8, #512] + ldr x19, [x8, #576] + ldr x19, [x8, #640] + ldr x19, [x8, #704] + ldr x19, [x8, #768] + ldr x19, [x8, #832] + ldr x19, [x8, #896] + ldr x19, [x8, #960] + ldr w12, [x8, x12, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x13, x16, #16, #8 + eor w12, w12, w15, ror 24 + ubfx x15, x16, #56, #8 + eor w12, w12, w21, ror 8 + ubfx x21, x17, #40, #8 + eor w12, w12, w22, ror 16 + ubfx x22, x17, #0, #8 + ldr w13, [x8, x13, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x14, x16, #48, #8 + eor w13, w13, w15, ror 24 + ubfx x15, x17, #24, #8 + eor w13, w13, w21, ror 8 + ubfx x21, x16, #8, #8 + eor w13, w13, w22, 
ror 16 + ubfx x22, x17, #32, #8 + bfi x12, x13, #32, #32 + ldr w14, [x8, x14, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x19, x16, #0, #8 + eor w14, w14, w15, ror 24 + ubfx x15, x17, #16, #8 + eor w14, w14, w21, ror 8 + ubfx x21, x17, #56, #8 + eor w13, w14, w22, ror 16 + ubfx x22, x16, #40, #8 + ldr w19, [x8, x19, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w15, [x8, x15, LSL 2] + ldr w22, [x8, x22, LSL 2] + eor w21, w21, w19, ror 24 + ldp x16, x17, [x28], #16 + eor w15, w15, w22, ror 8 + eor w15, w15, w21, ror 24 + bfi x13, x15, #32, #32 + # XOR in Key Schedule + eor x12, x12, x16 + eor x13, x13, x17 + subs w27, w27, #2 + bne L_AES_XTS_decrypt_loop_nr_partial_2 + ubfx x16, x13, #48, #8 + ubfx x20, x12, #24, #8 + ubfx x21, x13, #8, #8 + ubfx x22, x12, #32, #8 + ldr x14, [x8] + ldr x14, [x8, #64] + ldr x14, [x8, #128] + ldr x14, [x8, #192] + ldr x14, [x8, #256] + ldr x14, [x8, #320] + ldr x14, [x8, #384] + ldr x14, [x8, #448] + ldr x14, [x8, #512] + ldr x14, [x8, #576] + ldr x14, [x8, #640] + ldr x14, [x8, #704] + ldr x14, [x8, #768] + ldr x14, [x8, #832] + ldr x14, [x8, #896] + ldr x14, [x8, #960] + ldr w16, [x8, x16, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x17, x12, #16, #8 + eor w16, w16, w20, ror 24 + ubfx x20, x12, #56, #8 + eor w16, w16, w21, ror 8 + ubfx x21, x13, #40, #8 + eor w16, w16, w22, ror 16 + ubfx x22, x13, #0, #8 + ldr w17, [x8, x17, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x19, x12, #48, #8 + eor w17, w17, w20, ror 24 + ubfx x20, x13, #24, #8 + eor w17, w17, w21, ror 8 + ubfx x21, x12, #8, #8 + eor w17, w17, w22, ror 16 + ubfx x22, x13, #32, #8 + bfi x16, x17, #32, #32 + ldr w19, [x8, x19, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w22, [x8, x22, LSL 2] + ubfx x14, x12, #0, #8 + eor w19, w19, w20, ror 24 + ubfx x20, x13, #16, #8 + eor w19, w19, w21, ror 8 + 
ubfx x21, x13, #56, #8 + eor w17, w19, w22, ror 16 + ubfx x22, x12, #40, #8 + ldr w14, [x8, x14, LSL 2] + ldr w21, [x8, x21, LSL 2] + ldr w20, [x8, x20, LSL 2] + ldr w22, [x8, x22, LSL 2] + eor w21, w21, w14, ror 24 + ldp x12, x13, [x28], #16 + eor w20, w20, w22, ror 8 + eor w20, w20, w21, ror 24 + bfi x17, x20, #32, #32 + # XOR in Key Schedule + eor x16, x16, x12 + eor x17, x17, x13 + ubfx x12, x16, #32, #8 + ubfx x15, x17, #8, #8 + ubfx x21, x17, #48, #8 + ubfx x22, x16, #24, #8 + ldr x20, [x9] + ldr x20, [x9, #64] + ldr x20, [x9, #128] + ldr x20, [x9, #192] + ldr x20, [x9, #256] + ldr x20, [x9, #320] + ldr x20, [x9, #384] + ldr x20, [x9, #448] + ldr x20, [x9, #512] + ldr x20, [x9, #576] + ldr x20, [x9, #640] + ldr x20, [x9, #704] + ldr x20, [x9, #768] + ldr x20, [x9, #832] + ldr x20, [x9, #896] + ldr x20, [x9, #960] + ldrb w12, [x9, x12, LSL 0] + ldrb w15, [x9, x15, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ldrb w22, [x9, x22, LSL 0] + ubfx x13, x17, #0, #8 + eor w12, w12, w15, lsl 8 + ubfx x15, x17, #40, #8 + eor w12, w12, w21, lsl 16 + ubfx x21, x16, #16, #8 + eor w12, w12, w22, lsl 24 + ubfx x22, x16, #56, #8 + ldrb w15, [x9, x15, LSL 0] + ldrb w22, [x9, x22, LSL 0] + ldrb w13, [x9, x13, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ubfx x14, x17, #32, #8 + eor w13, w13, w15, lsl 8 + ubfx x15, x16, #8, #8 + eor w13, w13, w21, lsl 16 + ubfx x21, x16, #48, #8 + eor w13, w13, w22, lsl 24 + ubfx x22, x17, #24, #8 + bfi x12, x13, #32, #32 + ldrb w15, [x9, x15, LSL 0] + ldrb w22, [x9, x22, LSL 0] + ldrb w14, [x9, x14, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ubfx x20, x17, #56, #8 + eor w14, w14, w15, lsl 8 + ubfx x15, x16, #0, #8 + eor w14, w14, w21, lsl 16 + ubfx x21, x16, #40, #8 + eor w13, w14, w22, lsl 24 + ubfx x22, x17, #16, #8 + ldrb w20, [x9, x20, LSL 0] + ldrb w21, [x9, x21, LSL 0] + ldrb w15, [x9, x15, LSL 0] + ldrb w22, [x9, x22, LSL 0] + eor w21, w21, w20, lsl 16 + ldp x16, x17, [x28] + eor w15, w15, w21, lsl 8 + eor w15, w15, w22, lsl 16 + bfi x13, x15, #32, #32 + # 
XOR in Key Schedule + eor x12, x12, x16 + eor x13, x13, x17 + rev32 x12, x12 + rev32 x13, x13 + eor x12, x12, x23 + eor x13, x13, x24 + stp x12, x13, [x1] +L_AES_XTS_decrypt_done_data: + ldp x17, x19, [x29, #24] + ldp x20, x21, [x29, #40] + ldp x22, x23, [x29, #56] + ldp x24, x25, [x29, #72] + ldp x26, x27, [x29, #88] + ldr x28, [x29, #104] + ldp x29, x30, [sp], #0x70 + ret +#ifndef __APPLE__ + .size AES_XTS_decrypt,.-AES_XTS_decrypt +#endif /* __APPLE__ */ +#endif /* HAVE_AES_DECRYPT */ +#endif /* WOLFSSL_AES_XTS */ +#endif /* !WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP */ #endif /* !defined(NO_AES) && defined(WOLFSSL_ARMASM) */ #endif /* __aarch64__ */ #endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c index e76ad8e1a..c8d1a9633 100644 --- a/wolfcrypt/src/port/arm/armv8-aes-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-aes-asm_c.c @@ -43562,6 +43562,13287 @@ void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz, #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_XTS */ #endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#ifndef WOLFSSL_ARMASM_NO_NEON +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \ + defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +static const word8 L_AES_ARM64_NEON_te[] = { /* AES forward S-box (SubBytes), one byte per input value 0x00..0xff. */ + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, + 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, + 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, + 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, + 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, + 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, + 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d,
0x33, 0x85, + 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, + 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, + 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, + 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, + 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, + 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, + 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, + 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, + 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, +}; + +static const word8 L_AES_ARM64_NEON_shift_rows_shuffle[] = { /* 16 byte-lane indices (a permutation of 0..15); presumably a tbl shuffle implementing ShiftRows - confirm at the use site. */ + 0x0c, 0x09, 0x06, 0x03, 0x00, 0x0d, 0x0a, 0x07, + 0x04, 0x01, 0x0e, 0x0b, 0x08, 0x05, 0x02, 0x0f, +}; + +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || + * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +void AES_invert_key_NEON(unsigned char* ks, word32 rounds); +/* Rewrites a key schedule in place for decryption: reverses the order of the (rounds + 1) 16-byte round keys, then applies InvMixColumns to round keys 1..rounds-1. ks: key schedule of (rounds + 1) * 16 bytes, modified in place. rounds: round count; must be even and non-zero because the swap loop counts down by 2 until it reaches zero. */ void AES_invert_key_NEON(unsigned char* ks, word32 rounds) +{ + __asm__ __volatile__ ( + /* x3 = address of the last round key. rounds is a 32-bit value, so zero-extend it (uxtw) instead of reading the upper half of its register, which the AAPCS64 leaves unspecified. */ "add x3, %x[ks], %w[rounds], uxtw 4\n\t" + "mov x2, %x[ks]\n\t" + "mov w4, %w[rounds]\n\t" + "\n" + /* Swap round keys pairwise from both ends towards the middle. */ "L_AES_invert_key_NEON_loop_%=: \n\t" + "ld1 {v0.2d}, [x2]\n\t" + "ld1 {v1.2d}, [x3]\n\t" + "st1 {v0.2d}, [x3]\n\t" + "st1 {v1.2d}, [x2], #16\n\t" + "subs w4, w4, #2\n\t" + "sub x3, x3, #16\n\t" + "b.ne L_AES_invert_key_NEON_loop_%=\n\t" + /* v2 = 0x1b in every byte: the GF(2^8) reduction constant. */ "movi v2.16b, #27\n\t" + "add x2, %x[ks], #16\n\t" + "sub w4, %w[rounds], #1\n\t" + "\n" + /* InvMixColumns on each of the rounds-1 inner round keys (k = one round key). */ "L_AES_invert_key_NEON_mix_loop_%=: \n\t" + "ld1 {v0.2d}, [x2]\n\t" + /* Build v5 = 2*k, v6 = 4*k, v3 = 8*k bytewise in GF(2^8). */ "sshr v5.16b, v0.16b, #7\n\t" + "ushr v6.16b, v0.16b, #6\n\t"
+ "ushr v3.16b, v0.16b, #5\n\t" + "and v5.16b, v5.16b, v2.16b\n\t" + "pmul v6.16b, v6.16b, v2.16b\n\t" + "pmul v3.16b, v3.16b, v2.16b\n\t" + "shl v4.16b, v0.16b, #1\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "shl v4.16b, v0.16b, #3\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "shl v4.16b, v0.16b, #2\n\t" + "eor v6.16b, v6.16b, v4.16b\n\t" + /* Combine into v3 = 9*k, v4 = 11*k, v5 = 13*k, v6 = 14*k. */ "eor v4.16b, v5.16b, v3.16b\n\t" + "eor v3.16b, v3.16b, v0.16b\n\t" + "eor v5.16b, v6.16b, v3.16b\n\t" + "eor v6.16b, v6.16b, v4.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + /* Per 32-bit word: 14*k ^ rotl8(11*k) ^ rotl16(13*k) ^ rotl24(9*k). */ "shl v0.4s, v4.4s, #8\n\t" + "rev32 v5.8h, v5.8h\n\t" + "sri v0.4s, v4.4s, #24\n\t" + "eor v0.16b, v0.16b, v6.16b\n\t" + "shl v4.4s, v3.4s, #24\n\t" + "eor v0.16b, v0.16b, v5.16b\n\t" + "sri v4.4s, v3.4s, #8\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "st1 {v0.2d}, [x2], #16\n\t" + "subs w4, w4, #1\n\t" + "b.ne L_AES_invert_key_NEON_mix_loop_%=\n\t" + : [ks] "+r" (ks), [rounds] "+r" (rounds) + : + : "memory", "cc", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", + "v6" + ); +} + +#endif /* HAVE_AES_DECRYPT */ +static const word32 L_AES_ARM64_NEON_rcon[] = { /* AES key-schedule round constants 0x01..0x36, each held in the most significant byte of a 32-bit word. */ + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1b000000, 0x36000000, +}; + +void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len, + unsigned char* ks); +void AES_set_encrypt_key_NEON(const unsigned char* key, word32 len, + unsigned char* ks) +{ + const word32* rcon = L_AES_ARM64_NEON_rcon; + const word8* te = L_AES_ARM64_NEON_te; + __asm__ __volatile__ ( + "ld1 {v6.16b, v7.16b, v8.16b, v9.16b}, [%[te]], #0x40\n\t" + "ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [%[te]], #0x40\n\t" + "ld1 {v14.16b, v15.16b, v16.16b, v17.16b}, [%[te]], #0x40\n\t" + "ld1 {v18.16b, v19.16b, v20.16b, v21.16b}, [%[te]]\n\t" + "movi v2.16b, #0x40\n\t" + "movi v3.16b, #0x80\n\t" + "movi v4.16b, #0xc0\n\t" + "movi v5.16b, #27\n\t" + "eor v26.16b, v26.16b, v26.16b\n\t" + "cmp %w[len], #0x80\n\t" + "b.eq L_AES_set_encrypt_key_NEON_start_128_%=\n\t" + "cmp %w[len],
#0xc0\n\t" + "b.eq L_AES_set_encrypt_key_NEON_start_192_%=\n\t" + "ld1 {v0.16b}, [%x[key]], #16\n\t" + "ld1 {v1.16b}, [%x[key]]\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "st1 {v0.2d}, [%x[ks]], #16\n\t" + "st1 {v1.2d}, [%x[ks]], #16\n\t" + "mov x3, #6\n\t" + "\n" + "L_AES_set_encrypt_key_NEON_loop_256_%=: \n\t" + "eor v22.16b, v1.16b, v2.16b\n\t" + "eor v23.16b, v1.16b, v3.16b\n\t" + "eor v24.16b, v1.16b, v4.16b\n\t" + "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b\n\t" + "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" + "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" + "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" + "orr v25.16b, v25.16b, v22.16b\n\t" + "orr v23.16b, v23.16b, v24.16b\n\t" + "orr v25.16b, v25.16b, v23.16b\n\t" + "ext v25.16b, v25.16b, v26.16b, #12\n\t" + "shl v22.4s, v25.4s, #8\n\t" + "sri v22.4s, v25.4s, #24\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ld1r {v25.4s}, [%[rcon]], #4\n\t" + "dup v22.4s, v0.s[0]\n\t" + "dup v23.2s, v0.s[1]\n\t" + "dup v24.2s, v0.s[2]\n\t" + "ext v22.16b, v26.16b, v22.16b, #12\n\t" + "ext v23.16b, v26.16b, v23.16b, #8\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ext v24.16b, v26.16b, v24.16b, #4\n\t" + "eor v0.16b, v0.16b, v23.16b\n\t" + "eor v0.16b, v0.16b, v24.16b\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "st1 {v0.2d}, [%x[ks]], #16\n\t" + "eor v22.16b, v0.16b, v2.16b\n\t" + "eor v23.16b, v0.16b, v3.16b\n\t" + "eor v24.16b, v0.16b, v4.16b\n\t" + "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v0.16b\n\t" + "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" + "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" + "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" + "orr v25.16b, v25.16b, v22.16b\n\t" + "orr v23.16b, v23.16b, v24.16b\n\t" + "orr v25.16b, v25.16b, v23.16b\n\t" + "ext v25.16b, v25.16b, v26.16b, #12\n\t" + "eor v1.16b, v1.16b, v25.16b\n\t" + "dup v22.4s, v1.s[0]\n\t" + "dup v23.2s, 
v1.s[1]\n\t" + "dup v24.2s, v1.s[2]\n\t" + "ext v22.16b, v26.16b, v22.16b, #12\n\t" + "ext v23.16b, v26.16b, v23.16b, #8\n\t" + "eor v1.16b, v1.16b, v22.16b\n\t" + "ext v24.16b, v26.16b, v24.16b, #4\n\t" + "eor v1.16b, v1.16b, v23.16b\n\t" + "eor v1.16b, v1.16b, v24.16b\n\t" + "st1 {v1.2d}, [%x[ks]], #16\n\t" + "subs x3, x3, #1\n\t" + "b.ne L_AES_set_encrypt_key_NEON_loop_256_%=\n\t" + "eor v22.16b, v1.16b, v2.16b\n\t" + "eor v23.16b, v1.16b, v3.16b\n\t" + "eor v24.16b, v1.16b, v4.16b\n\t" + "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b\n\t" + "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" + "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" + "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" + "orr v25.16b, v25.16b, v22.16b\n\t" + "orr v23.16b, v23.16b, v24.16b\n\t" + "orr v25.16b, v25.16b, v23.16b\n\t" + "ext v25.16b, v25.16b, v26.16b, #12\n\t" + "shl v22.4s, v25.4s, #8\n\t" + "sri v22.4s, v25.4s, #24\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ld1r {v25.4s}, [%[rcon]], #4\n\t" + "dup v22.4s, v0.s[0]\n\t" + "dup v23.2s, v0.s[1]\n\t" + "dup v24.2s, v0.s[2]\n\t" + "ext v22.16b, v26.16b, v22.16b, #12\n\t" + "ext v23.16b, v26.16b, v23.16b, #8\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ext v24.16b, v26.16b, v24.16b, #4\n\t" + "eor v0.16b, v0.16b, v23.16b\n\t" + "eor v0.16b, v0.16b, v24.16b\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "st1 {v0.2d}, [%x[ks]], #16\n\t" + "b L_AES_set_encrypt_key_NEON_end_%=\n\t" + "\n" + "L_AES_set_encrypt_key_NEON_start_192_%=: \n\t" + "ld1 {v0.16b}, [%x[key]], #16\n\t" + "ld1 {v1.8b}, [%x[key]]\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.8b, v1.8b\n\t" + "st1 {v0.16b}, [%x[ks]], #16\n\t" + "st1 {v1.8b}, [%x[ks]], #8\n\t" + "ext v1.16b, v1.16b, v1.16b, #8\n\t" + "mov x3, #7\n\t" + "\n" + "L_AES_set_encrypt_key_NEON_loop_192_%=: \n\t" + "eor v22.16b, v1.16b, v2.16b\n\t" + "eor v23.16b, v1.16b, v3.16b\n\t" + "eor v24.16b, v1.16b, v4.16b\n\t" + "tbl v25.16b, {v6.16b, v7.16b, v8.16b, 
v9.16b}, v1.16b\n\t" + "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" + "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" + "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" + "orr v25.16b, v25.16b, v22.16b\n\t" + "orr v23.16b, v23.16b, v24.16b\n\t" + "orr v25.16b, v25.16b, v23.16b\n\t" + "ext v25.16b, v25.16b, v26.16b, #12\n\t" + "shl v22.4s, v25.4s, #8\n\t" + "sri v22.4s, v25.4s, #24\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ld1r {v25.4s}, [%[rcon]], #4\n\t" + "dup v22.4s, v0.s[0]\n\t" + "dup v23.2s, v0.s[1]\n\t" + "dup v24.2s, v0.s[2]\n\t" + "ext v22.16b, v26.16b, v22.16b, #12\n\t" + "ext v23.16b, v26.16b, v23.16b, #8\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ext v24.16b, v26.16b, v24.16b, #4\n\t" + "eor v0.16b, v0.16b, v23.16b\n\t" + "eor v0.16b, v0.16b, v24.16b\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "st1 {v0.2d}, [%x[ks]], #16\n\t" + "mov v23.16b, v26.16b\n\t" + "mov v23.s[2], v0.s[3]\n\t" + "eor v1.16b, v1.16b, v23.16b\n\t" + "mov v23.16b, v26.16b\n\t" + "mov v23.s[3], v1.s[2]\n\t" + "eor v1.16b, v1.16b, v23.16b\n\t" + "st1 {v1.d}[1], [%x[ks]], #8\n\t" + "subs x3, x3, #1\n\t" + "b.ne L_AES_set_encrypt_key_NEON_loop_192_%=\n\t" + "eor v22.16b, v1.16b, v2.16b\n\t" + "eor v23.16b, v1.16b, v3.16b\n\t" + "eor v24.16b, v1.16b, v4.16b\n\t" + "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v1.16b\n\t" + "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" + "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" + "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" + "orr v25.16b, v25.16b, v22.16b\n\t" + "orr v23.16b, v23.16b, v24.16b\n\t" + "orr v25.16b, v25.16b, v23.16b\n\t" + "ext v25.16b, v25.16b, v26.16b, #12\n\t" + "shl v22.4s, v25.4s, #8\n\t" + "sri v22.4s, v25.4s, #24\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ld1r {v25.4s}, [%[rcon]], #4\n\t" + "dup v22.4s, v0.s[0]\n\t" + "dup v23.2s, v0.s[1]\n\t" + "dup v24.2s, v0.s[2]\n\t" + "ext v22.16b, v26.16b, v22.16b, #12\n\t" +
"ext v23.16b, v26.16b, v23.16b, #8\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ext v24.16b, v26.16b, v24.16b, #4\n\t" + "eor v0.16b, v0.16b, v23.16b\n\t" + "eor v0.16b, v0.16b, v24.16b\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "st1 {v0.2d}, [%x[ks]], #16\n\t" + "b L_AES_set_encrypt_key_NEON_end_%=\n\t" + "\n" + "L_AES_set_encrypt_key_NEON_start_128_%=: \n\t" + "ld1 {v0.16b}, [%x[key]]\n\t" + "rev32 v0.16b, v0.16b\n\t" + "st1 {v0.2d}, [%x[ks]], #16\n\t" + "mov x3, #10\n\t" + "\n" + "L_AES_set_encrypt_key_NEON_loop_128_%=: \n\t" + "eor v22.16b, v0.16b, v2.16b\n\t" + "eor v23.16b, v0.16b, v3.16b\n\t" + "eor v24.16b, v0.16b, v4.16b\n\t" + "tbl v25.16b, {v6.16b, v7.16b, v8.16b, v9.16b}, v0.16b\n\t" + "tbl v22.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v22.16b\n\t" + "tbl v23.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v23.16b\n\t" + "tbl v24.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v24.16b\n\t" + "orr v25.16b, v25.16b, v22.16b\n\t" + "orr v23.16b, v23.16b, v24.16b\n\t" + "orr v25.16b, v25.16b, v23.16b\n\t" + "ext v25.16b, v25.16b, v26.16b, #12\n\t" + "shl v22.4s, v25.4s, #8\n\t" + "sri v22.4s, v25.4s, #24\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ld1r {v25.4s}, [%[rcon]], #4\n\t" + "dup v22.4s, v0.s[0]\n\t" + "dup v23.2s, v0.s[1]\n\t" + "dup v24.2s, v0.s[2]\n\t" + "ext v22.16b, v26.16b, v22.16b, #12\n\t" + "ext v23.16b, v26.16b, v23.16b, #8\n\t" + "eor v0.16b, v0.16b, v22.16b\n\t" + "ext v24.16b, v26.16b, v24.16b, #4\n\t" + "eor v0.16b, v0.16b, v23.16b\n\t" + "eor v0.16b, v0.16b, v24.16b\n\t" + "eor v0.16b, v0.16b, v25.16b\n\t" + "st1 {v0.2d}, [%x[ks]], #16\n\t" + "subs x3, x3, #1\n\t" + "b.ne L_AES_set_encrypt_key_NEON_loop_128_%=\n\t" + "\n" + "L_AES_set_encrypt_key_NEON_end_%=: \n\t" + : [len] "+r" (len), [ks] "+r" (ks), [key] "+r" (key), [rcon] "+r" (rcon), [te] "+r" (te) + : /* no input-only operands: key, rcon and te are advanced by post-indexed loads */ + : "memory", "cc", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26" + ); +} + +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_ECB) +void AES_ECB_encrypt_NEON(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr); +void AES_ECB_encrypt_NEON(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr) +{ + const word8* te = L_AES_ARM64_NEON_te; + const word8* shuffle = L_AES_ARM64_NEON_shift_rows_shuffle; + __asm__ __volatile__ ( + "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[te]], #0x40\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[te]], #0x40\n\t" + "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[te]], #0x40\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[te]]\n\t" + "cmp %x[len], #0x40\n\t" + "b.lt L_AES_ECB_encrypt_NEON_start_2_%=\n\t" + "\n" + "L_AES_ECB_encrypt_NEON_loop_4_%=: \n\t" + "mov x8, %x[ks]\n\t" + "ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t" + "ld1 {v4.2d}, [x8], #16\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "sub w7, %w[nr], #2\n\t" + "\n" + "L_AES_ECB_encrypt_NEON_loop_nr_4_%=: \n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, 
{v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "movi v0.16b, #27\n\t" + "and v8.16b, v8.16b, v0.16b\n\t" + "and v9.16b, v9.16b, v0.16b\n\t" + "and v10.16b, v10.16b, 
v0.16b\n\t" + "and v11.16b, v11.16b, v0.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "eor v2.16b, v10.16b, v6.16b\n\t" + "eor v3.16b, v11.16b, v7.16b\n\t" + "shl v12.4s, v0.4s, #8\n\t" + "shl v13.4s, v1.4s, #8\n\t" + "shl v14.4s, v2.4s, #8\n\t" + "shl v15.4s, v3.4s, #8\n\t" + "sri v12.4s, v0.4s, #24\n\t" + "sri v13.4s, v1.4s, #24\n\t" + "sri v14.4s, v2.4s, #24\n\t" + "sri v15.4s, v3.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "shl v2.4s, v6.4s, #24\n\t" + "shl v3.4s, v7.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "sri v2.4s, v6.4s, #8\n\t" + "sri v3.4s, v7.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "rev32 v6.8h, v6.8h\n\t" + "rev32 v7.8h, v7.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x8], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor 
v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "sshr v10.16b, v2.16b, #7\n\t" + "sshr v11.16b, v3.16b, #7\n\t" + "shl v12.16b, v0.16b, #1\n\t" + "shl v13.16b, v1.16b, #1\n\t" + "shl v14.16b, v2.16b, #1\n\t" + "shl 
v15.16b, v3.16b, #1\n\t" + "movi v4.16b, #27\n\t" + "and v8.16b, v8.16b, v4.16b\n\t" + "and v9.16b, v9.16b, v4.16b\n\t" + "and v10.16b, v10.16b, v4.16b\n\t" + "and v11.16b, v11.16b, v4.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v4.16b, v8.16b, v0.16b\n\t" + "eor v5.16b, v9.16b, v1.16b\n\t" + "eor v6.16b, v10.16b, v2.16b\n\t" + "eor v7.16b, v11.16b, v3.16b\n\t" + "shl v12.4s, v4.4s, #8\n\t" + "shl v13.4s, v5.4s, #8\n\t" + "shl v14.4s, v6.4s, #8\n\t" + "shl v15.4s, v7.4s, #8\n\t" + "sri v12.4s, v4.4s, #24\n\t" + "sri v13.4s, v5.4s, #24\n\t" + "sri v14.4s, v6.4s, #24\n\t" + "sri v15.4s, v7.4s, #24\n\t" + "shl v4.4s, v0.4s, #24\n\t" + "shl v5.4s, v1.4s, #24\n\t" + "shl v6.4s, v2.4s, #24\n\t" + "shl v7.4s, v3.4s, #24\n\t" + "sri v4.4s, v0.4s, #8\n\t" + "sri v5.4s, v1.4s, #8\n\t" + "sri v6.4s, v2.4s, #8\n\t" + "sri v7.4s, v3.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "rev32 v2.8h, v2.8h\n\t" + "rev32 v3.8h, v3.8h\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x8], #16\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "eor v2.16b, v2.16b, v14.16b\n\t" + "eor v3.16b, v3.16b, v15.16b\n\t" + /* Round Done */ + "subs w7, w7, #2\n\t" + "b.ne L_AES_ECB_encrypt_NEON_loop_nr_4_%=\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, 
{v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, 
#7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "movi v0.16b, #27\n\t" + "and v8.16b, v8.16b, v0.16b\n\t" + "and v9.16b, v9.16b, v0.16b\n\t" + "and v10.16b, v10.16b, v0.16b\n\t" + "and v11.16b, v11.16b, v0.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "eor v2.16b, v10.16b, v6.16b\n\t" + "eor v3.16b, v11.16b, v7.16b\n\t" + "shl v12.4s, v0.4s, #8\n\t" + "shl v13.4s, v1.4s, #8\n\t" + "shl v14.4s, v2.4s, #8\n\t" + "shl v15.4s, v3.4s, #8\n\t" + "sri v12.4s, v0.4s, #24\n\t" + "sri v13.4s, v1.4s, #24\n\t" + "sri v14.4s, v2.4s, #24\n\t" + "sri v15.4s, v3.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "shl v2.4s, v6.4s, #24\n\t" + "shl v3.4s, v7.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "sri v2.4s, v6.4s, #8\n\t" + "sri v3.4s, v7.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "rev32 v6.8h, v6.8h\n\t" + "rev32 v7.8h, v7.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x8], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl 
v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" 
+ "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x8], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" + "sub %x[len], %x[len], #0x40\n\t" + "cmp %x[len], #0x40\n\t" + "b.ge L_AES_ECB_encrypt_NEON_loop_4_%=\n\t" + "\n" + "L_AES_ECB_encrypt_NEON_start_2_%=: \n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "movi v15.16b, #27\n\t" + "cmp %x[len], #16\n\t" + "b.eq L_AES_ECB_encrypt_NEON_start_1_%=\n\t" + "b.lt L_AES_ECB_encrypt_NEON_data_done_%=\n\t" + "\n" + "L_AES_ECB_encrypt_NEON_loop_2_%=: \n\t" + "mov x8, %x[ks]\n\t" + "ld1 {v0.16b, v1.16b}, [%x[in]], #32\n\t" + "ld1 {v4.2d}, [x8], #16\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "sub w7, %w[nr], #2\n\t" + "\n" + "L_AES_ECB_encrypt_NEON_loop_nr_2_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, 
v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v10.16b, v4.16b, #1\n\t" + "shl v11.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "shl v10.4s, v0.4s, #8\n\t" + "shl v11.4s, v1.4s, #8\n\t" + "sri v10.4s, v0.4s, #24\n\t" + "sri v11.4s, v1.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x8], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* Round Done */ + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" 
+ "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "shl v10.16b, v0.16b, #1\n\t" + "shl v11.16b, v1.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v4.16b, v8.16b, v0.16b\n\t" + "eor v5.16b, v9.16b, v1.16b\n\t" + "shl v10.4s, v4.4s, #8\n\t" + "shl v11.4s, v5.4s, #8\n\t" + "sri v10.4s, v4.4s, #24\n\t" + "sri v11.4s, v5.4s, #24\n\t" + "shl v4.4s, v0.4s, #24\n\t" + "shl v5.4s, v1.4s, #24\n\t" + "sri v4.4s, v0.4s, #8\n\t" + "sri v5.4s, v1.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x8], #16\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + /* Round Done */ + "subs w7, w7, #2\n\t" + "b.ne L_AES_ECB_encrypt_NEON_loop_nr_2_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, 
v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v10.16b, v4.16b, #1\n\t" + "shl v11.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "shl v10.4s, v0.4s, #8\n\t" + "shl v11.4s, v1.4s, #8\n\t" + "sri v10.4s, v0.4s, #24\n\t" + "sri v11.4s, v1.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x8], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* Round Done */ + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, 
v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x8], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "st1 {v0.16b, v1.16b}, [%x[out]], #32\n\t" + "sub %x[len], %x[len], #32\n\t" + "cmp %x[len], #0\n\t" + "b.eq L_AES_ECB_encrypt_NEON_data_done_%=\n\t" + "\n" + "L_AES_ECB_encrypt_NEON_start_1_%=: \n\t" + "ld1 {v3.2d}, [%[shuffle]]\n\t" + "mov x8, %x[ks]\n\t" + "ld1 {v0.16b}, [%x[in]], #16\n\t" + "ld1 {v4.2d}, [x8], #16\n\t" + "rev32 v0.16b, v0.16b\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "sub w7, %w[nr], #2\n\t" + "\n" + "L_AES_ECB_encrypt_NEON_loop_nr_1_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x8], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor 
v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x8], #16\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v0.8h\n\t" + "eor v11.16b, v10.16b, v0.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v0.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v4.16b\n\t" + "sri v9.4s, v0.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v0.16b, v10.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "subs w7, w7, #2\n\t" + "b.ne L_AES_ECB_encrypt_NEON_loop_nr_1_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, 
{v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x8], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x8], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "st1 {v0.16b}, [%x[out]], #16\n\t" + "\n" + "L_AES_ECB_encrypt_NEON_data_done_%=: \n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr) + : [in] "r" (in), [ks] "r" (ks), [te] "r" (te), [shuffle] "r" (shuffle) + : "memory", "cc", "x7", "x8", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31" + ); +} + +#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || + * WOLFSSL_AES_COUNTER || HAVE_AES_ECB */ +#ifdef HAVE_AES_CBC +void AES_CBC_encrypt_NEON(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); +void AES_CBC_encrypt_NEON(const unsigned char* in, unsigned 
char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv) +{ + const word8* te = L_AES_ARM64_NEON_te; + const word8* shuffle = L_AES_ARM64_NEON_shift_rows_shuffle; + __asm__ __volatile__ ( + "ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [%[te]], #0x40\n\t" + "ld1 {v14.16b, v15.16b, v16.16b, v17.16b}, [%[te]], #0x40\n\t" + "ld1 {v18.16b, v19.16b, v20.16b, v21.16b}, [%[te]], #0x40\n\t" + "ld1 {v22.16b, v23.16b, v24.16b, v25.16b}, [%[te]]\n\t" + "movi v6.16b, #0x40\n\t" + "movi v7.16b, #0x80\n\t" + "movi v8.16b, #0xc0\n\t" + "movi v9.16b, #27\n\t" + "ld1 {v0.2d}, [%x[iv]]\n\t" + "ld1 {v26.2d}, [%[shuffle]]\n\t" + "\n" + "L_AES_CBC_encrypt_NEON_loop_block_%=: \n\t" + "add x9, %x[ks], #16\n\t" + "ld1 {v1.16b}, [%x[in]], #16\n\t" + "ld1 {v2.16b}, [%x[ks]]\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v0.16b, v0.16b, v2.16b\n\t" + "sub w8, %w[nr], #2\n\t" + "\n" + "L_AES_CBC_encrypt_NEON_loop_nr_%=: \n\t" + "eor v2.16b, v0.16b, v6.16b\n\t" + "eor v3.16b, v0.16b, v7.16b\n\t" + "eor v4.16b, v0.16b, v8.16b\n\t" + "tbl v1.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v0.16b\n\t" + "tbl v2.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v2.16b\n\t" + "tbl v3.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v3.16b\n\t" + "tbl v4.16b, {v22.16b, v23.16b, v24.16b, v25.16b}, v4.16b\n\t" + "orr v1.16b, v1.16b, v2.16b\n\t" + "orr v3.16b, v3.16b, v4.16b\n\t" + "orr v1.16b, v1.16b, v3.16b\n\t" + "tbl v1.16b, {v1.16b}, v26.16b\n\t" + "ld1 {v0.2d}, [x9], #16\n\t" + "sshr v4.16b, v1.16b, #7\n\t" + "shl v3.16b, v1.16b, #1\n\t" + "and v4.16b, v4.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v3.16b\n\t" + "rev32 v2.8h, v1.8h\n\t" + "eor v5.16b, v4.16b, v1.16b\n\t" + "eor v4.16b, v4.16b, v2.16b\n\t" + "shl v3.4s, v1.4s, #24\n\t" + "shl v2.4s, v5.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "sri v3.4s, v1.4s, #8\n\t" + "sri v2.4s, v5.4s, #24\n\t" + "eor v1.16b, v4.16b, v3.16b\n\t" + "eor 
v1.16b, v1.16b, v2.16b\n\t" + "eor v2.16b, v1.16b, v6.16b\n\t" + "eor v3.16b, v1.16b, v7.16b\n\t" + "eor v4.16b, v1.16b, v8.16b\n\t" + "tbl v0.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v1.16b\n\t" + "tbl v2.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v2.16b\n\t" + "tbl v3.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v3.16b\n\t" + "tbl v4.16b, {v22.16b, v23.16b, v24.16b, v25.16b}, v4.16b\n\t" + "orr v0.16b, v0.16b, v2.16b\n\t" + "orr v3.16b, v3.16b, v4.16b\n\t" + "orr v0.16b, v0.16b, v3.16b\n\t" + "tbl v0.16b, {v0.16b}, v26.16b\n\t" + "ld1 {v1.2d}, [x9], #16\n\t" + "sshr v4.16b, v0.16b, #7\n\t" + "shl v3.16b, v0.16b, #1\n\t" + "and v4.16b, v4.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v3.16b\n\t" + "rev32 v2.8h, v0.8h\n\t" + "eor v5.16b, v4.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v2.16b\n\t" + "shl v3.4s, v0.4s, #24\n\t" + "shl v2.4s, v5.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v1.16b\n\t" + "sri v3.4s, v0.4s, #8\n\t" + "sri v2.4s, v5.4s, #24\n\t" + "eor v0.16b, v4.16b, v3.16b\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "subs w8, w8, #2\n\t" + "b.ne L_AES_CBC_encrypt_NEON_loop_nr_%=\n\t" + "eor v2.16b, v0.16b, v6.16b\n\t" + "eor v3.16b, v0.16b, v7.16b\n\t" + "eor v4.16b, v0.16b, v8.16b\n\t" + "tbl v1.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v0.16b\n\t" + "tbl v2.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v2.16b\n\t" + "tbl v3.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v3.16b\n\t" + "tbl v4.16b, {v22.16b, v23.16b, v24.16b, v25.16b}, v4.16b\n\t" + "orr v1.16b, v1.16b, v2.16b\n\t" + "orr v3.16b, v3.16b, v4.16b\n\t" + "orr v1.16b, v1.16b, v3.16b\n\t" + "tbl v1.16b, {v1.16b}, v26.16b\n\t" + "ld1 {v0.2d}, [x9], #16\n\t" + "sshr v4.16b, v1.16b, #7\n\t" + "shl v3.16b, v1.16b, #1\n\t" + "and v4.16b, v4.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v3.16b\n\t" + "rev32 v2.8h, v1.8h\n\t" + "eor v5.16b, v4.16b, v1.16b\n\t" + "eor v4.16b, v4.16b, v2.16b\n\t" + "shl v3.4s, v1.4s, #24\n\t" + "shl v2.4s, v5.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, 
v0.16b\n\t" + "sri v3.4s, v1.4s, #8\n\t" + "sri v2.4s, v5.4s, #24\n\t" + "eor v1.16b, v4.16b, v3.16b\n\t" + "eor v1.16b, v1.16b, v2.16b\n\t" + "eor v2.16b, v1.16b, v6.16b\n\t" + "eor v3.16b, v1.16b, v7.16b\n\t" + "eor v4.16b, v1.16b, v8.16b\n\t" + "tbl v0.16b, {v10.16b, v11.16b, v12.16b, v13.16b}, v1.16b\n\t" + "tbl v2.16b, {v14.16b, v15.16b, v16.16b, v17.16b}, v2.16b\n\t" + "tbl v3.16b, {v18.16b, v19.16b, v20.16b, v21.16b}, v3.16b\n\t" + "tbl v4.16b, {v22.16b, v23.16b, v24.16b, v25.16b}, v4.16b\n\t" + "orr v0.16b, v0.16b, v2.16b\n\t" + "orr v3.16b, v3.16b, v4.16b\n\t" + "orr v0.16b, v0.16b, v3.16b\n\t" + "tbl v0.16b, {v0.16b}, v26.16b\n\t" + "ld1 {v1.2d}, [x9], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v1.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "st1 {v0.16b}, [%x[out]], #16\n\t" + "subs %x[len], %x[len], #16\n\t" + "b.ne L_AES_CBC_encrypt_NEON_loop_block_%=\n\t" + "st1 {v0.2d}, [%x[iv]]\n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [iv] "+r" (iv) + : [in] "r" (in), [ks] "r" (ks), [te] "r" (te), [shuffle] "r" (shuffle) + : "memory", "cc", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26" + ); +} + +#endif /* HAVE_AES_CBC */ +#ifdef WOLFSSL_AES_COUNTER +void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +void AES_CTR_encrypt_NEON(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr) +{ + const word8* te = L_AES_ARM64_NEON_te; + const word8* shuffle = L_AES_ARM64_NEON_shift_rows_shuffle; + __asm__ __volatile__ ( + "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[te]], #0x40\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[te]], #0x40\n\t" + "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[te]], #0x40\n\t" + "ld1 {v28.16b, v29.16b, 
v30.16b, v31.16b}, [%[te]]\n\t" + "ld1 {v2.2d}, [%x[ctr]]\n\t" + "rev64 v8.16b, v2.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "mov x10, v8.d[1]\n\t" + "mov x11, v8.d[0]\n\t" + "cmp %x[len], #0x40\n\t" + "b.lt L_AES_CTR_encrypt_NEON_start_2_%=\n\t" + "\n" + "L_AES_CTR_encrypt_NEON_loop_4_%=: \n\t" + "mov x9, %x[ks]\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" + "mov v8.d[1], x10\n\t" + "mov v8.d[0], x11\n\t" + "rev64 v8.16b, v8.16b\n\t" + "rev32 v8.16b, v8.16b\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v0.16b, v8.16b, v4.16b\n\t" + "adds x10, x10, #1\n\t" + "adc x11, x11, xzr\n\t" + "mov v8.d[1], x10\n\t" + "mov v8.d[0], x11\n\t" + "rev64 v8.16b, v8.16b\n\t" + "rev32 v8.16b, v8.16b\n\t" + "eor v1.16b, v8.16b, v4.16b\n\t" + "adds x10, x10, #1\n\t" + "adc x11, x11, xzr\n\t" + "mov v8.d[1], x10\n\t" + "mov v8.d[0], x11\n\t" + "rev64 v8.16b, v8.16b\n\t" + "rev32 v8.16b, v8.16b\n\t" + "eor v2.16b, v8.16b, v4.16b\n\t" + "adds x10, x10, #1\n\t" + "adc x11, x11, xzr\n\t" + "mov v8.d[1], x10\n\t" + "mov v8.d[0], x11\n\t" + "rev64 v8.16b, v8.16b\n\t" + "rev32 v8.16b, v8.16b\n\t" + "eor v3.16b, v8.16b, v4.16b\n\t" + "adds x10, x10, #1\n\t" + "adc x11, x11, xzr\n\t" + "mov v8.d[1], x10\n\t" + "mov v8.d[0], x11\n\t" + "rev64 v8.16b, v8.16b\n\t" + "rev32 v8.16b, v8.16b\n\t" + "sub w8, %w[nr], #2\n\t" + "\n" + "L_AES_CTR_encrypt_NEON_loop_nr_4_%=: \n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, 
{v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "movi v0.16b, #27\n\t" + "and v8.16b, v8.16b, v0.16b\n\t" + "and v9.16b, v9.16b, v0.16b\n\t" + "and v10.16b, v10.16b, v0.16b\n\t" + "and v11.16b, v11.16b, v0.16b\n\t" + "eor v8.16b, v8.16b, 
v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "eor v2.16b, v10.16b, v6.16b\n\t" + "eor v3.16b, v11.16b, v7.16b\n\t" + "shl v12.4s, v0.4s, #8\n\t" + "shl v13.4s, v1.4s, #8\n\t" + "shl v14.4s, v2.4s, #8\n\t" + "shl v15.4s, v3.4s, #8\n\t" + "sri v12.4s, v0.4s, #24\n\t" + "sri v13.4s, v1.4s, #24\n\t" + "sri v14.4s, v2.4s, #24\n\t" + "sri v15.4s, v3.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "shl v2.4s, v6.4s, #24\n\t" + "shl v3.4s, v7.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "sri v2.4s, v6.4s, #8\n\t" + "sri v3.4s, v7.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "rev32 v6.8h, v6.8h\n\t" + "rev32 v7.8h, v7.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x9], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl 
v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "sshr v10.16b, v2.16b, #7\n\t" + "sshr v11.16b, v3.16b, #7\n\t" + "shl v12.16b, v0.16b, #1\n\t" + "shl v13.16b, v1.16b, #1\n\t" + "shl v14.16b, v2.16b, #1\n\t" + "shl v15.16b, v3.16b, #1\n\t" + "movi v4.16b, #27\n\t" + "and v8.16b, v8.16b, 
v4.16b\n\t" + "and v9.16b, v9.16b, v4.16b\n\t" + "and v10.16b, v10.16b, v4.16b\n\t" + "and v11.16b, v11.16b, v4.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v4.16b, v8.16b, v0.16b\n\t" + "eor v5.16b, v9.16b, v1.16b\n\t" + "eor v6.16b, v10.16b, v2.16b\n\t" + "eor v7.16b, v11.16b, v3.16b\n\t" + "shl v12.4s, v4.4s, #8\n\t" + "shl v13.4s, v5.4s, #8\n\t" + "shl v14.4s, v6.4s, #8\n\t" + "shl v15.4s, v7.4s, #8\n\t" + "sri v12.4s, v4.4s, #24\n\t" + "sri v13.4s, v5.4s, #24\n\t" + "sri v14.4s, v6.4s, #24\n\t" + "sri v15.4s, v7.4s, #24\n\t" + "shl v4.4s, v0.4s, #24\n\t" + "shl v5.4s, v1.4s, #24\n\t" + "shl v6.4s, v2.4s, #24\n\t" + "shl v7.4s, v3.4s, #24\n\t" + "sri v4.4s, v0.4s, #8\n\t" + "sri v5.4s, v1.4s, #8\n\t" + "sri v6.4s, v2.4s, #8\n\t" + "sri v7.4s, v3.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "rev32 v2.8h, v2.8h\n\t" + "rev32 v3.8h, v3.8h\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "eor v2.16b, v2.16b, v14.16b\n\t" + "eor v3.16b, v3.16b, v15.16b\n\t" + /* Round Done */ + "subs w8, w8, #2\n\t" + "b.ne L_AES_CTR_encrypt_NEON_loop_nr_4_%=\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, 
#0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, 
#7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "movi v0.16b, #27\n\t" + "and v8.16b, v8.16b, v0.16b\n\t" + "and v9.16b, v9.16b, v0.16b\n\t" + "and v10.16b, v10.16b, v0.16b\n\t" + "and v11.16b, v11.16b, v0.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "eor v2.16b, v10.16b, v6.16b\n\t" + "eor v3.16b, v11.16b, v7.16b\n\t" + "shl v12.4s, v0.4s, #8\n\t" + "shl v13.4s, v1.4s, #8\n\t" + "shl v14.4s, v2.4s, #8\n\t" + "shl v15.4s, v3.4s, #8\n\t" + "sri v12.4s, v0.4s, #24\n\t" + "sri v13.4s, v1.4s, #24\n\t" + "sri v14.4s, v2.4s, #24\n\t" + "sri v15.4s, v3.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "shl v2.4s, v6.4s, #24\n\t" + "shl v3.4s, v7.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "sri v2.4s, v6.4s, #8\n\t" + "sri v3.4s, v7.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "rev32 v6.8h, v6.8h\n\t" + "rev32 v7.8h, v7.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x9], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl 
v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, 
v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" + "sub %x[len], %x[len], #0x40\n\t" + "cmp %x[len], #0x40\n\t" + "b.ge L_AES_CTR_encrypt_NEON_loop_4_%=\n\t" + "mov v2.d[1], x10\n\t" + "mov v2.d[0], x11\n\t" + "rev64 v2.16b, v2.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "\n" + "L_AES_CTR_encrypt_NEON_start_2_%=: \n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "movi v15.16b, #27\n\t" + "cmp %x[len], #16\n\t" + "b.eq L_AES_CTR_encrypt_NEON_start_1_%=\n\t" + "b.lt L_AES_CTR_encrypt_NEON_data_done_%=\n\t" + "\n" + "L_AES_CTR_encrypt_NEON_loop_2_%=: \n\t" + "mov x9, %x[ks]\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v0.16b, v2.16b, v4.16b\n\t" + "adds x10, x10, #1\n\t" + "adc x11, x11, xzr\n\t" + "mov v2.d[1], x10\n\t" + "mov v2.d[0], x11\n\t" + "rev64 v2.16b, v2.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "eor v1.16b, v2.16b, v4.16b\n\t" + "adds x10, x10, #1\n\t" + "adc x11, x11, xzr\n\t" + "mov v2.d[1], x10\n\t" + "mov v2.d[0], x11\n\t" + "rev64 v2.16b, v2.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "sub w8, %w[nr], #2\n\t" + "\n" + "L_AES_CTR_encrypt_NEON_loop_nr_2_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, 
v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v10.16b, v4.16b, #1\n\t" + "shl v11.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "shl v10.4s, v0.4s, #8\n\t" + "shl v11.4s, v1.4s, #8\n\t" + "sri v10.4s, v0.4s, #24\n\t" + "sri v11.4s, v1.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x9], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* Round Done */ + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, 
v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "shl v10.16b, v0.16b, #1\n\t" + "shl v11.16b, v1.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v4.16b, v8.16b, v0.16b\n\t" + "eor v5.16b, v9.16b, v1.16b\n\t" + "shl v10.4s, v4.4s, #8\n\t" + "shl v11.4s, v5.4s, #8\n\t" + "sri v10.4s, v4.4s, #24\n\t" + "sri v11.4s, v5.4s, #24\n\t" + "shl v4.4s, v0.4s, #24\n\t" + "shl v5.4s, v1.4s, #24\n\t" + "sri v4.4s, v0.4s, #8\n\t" + "sri v5.4s, v1.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + /* Round Done */ + "subs w8, w8, #2\n\t" + "b.ne L_AES_CTR_encrypt_NEON_loop_nr_2_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + 
"eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v10.16b, v4.16b, #1\n\t" + "shl v11.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "shl v10.4s, v0.4s, #8\n\t" + "shl v11.4s, v1.4s, #8\n\t" + "sri v10.4s, v0.4s, #24\n\t" + "sri v11.4s, v1.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x9], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, 
v11.16b\n\t" + /* Round Done */ + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "ld1 {v4.16b, v5.16b}, [%x[in]], #32\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "st1 {v0.16b, v1.16b}, [%x[out]], #32\n\t" + "sub %x[len], %x[len], #32\n\t" + "cmp %x[len], #0\n\t" + "b.eq L_AES_CTR_encrypt_NEON_data_done_%=\n\t" + "\n" + "L_AES_CTR_encrypt_NEON_start_1_%=: \n\t" + "ld1 {v3.2d}, [%[shuffle]]\n\t" + "mov x9, %x[ks]\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v0.16b, v2.16b, v4.16b\n\t" + "sub w8, %w[nr], #2\n\t" + "\n" + "L_AES_CTR_encrypt_NEON_loop_nr_1_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, 
v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x9], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v0.8h\n\t" + "eor v11.16b, v10.16b, v0.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v0.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v4.16b\n\t" + "sri v9.4s, v0.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v0.16b, v10.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "subs w8, w8, #2\n\t" + "b.ne 
L_AES_CTR_encrypt_NEON_loop_nr_1_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x9], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "ld1 {v4.16b}, [%x[in]], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "st1 {v0.16b}, [%x[out]], #16\n\t" + "adds x10, x10, #1\n\t" + "adc x11, x11, xzr\n\t" + "mov v2.d[1], x10\n\t" + "mov v2.d[0], x11\n\t" + "rev64 v2.16b, v2.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "\n" + 
"L_AES_CTR_encrypt_NEON_data_done_%=: \n\t" + "rev32 v2.16b, v2.16b\n\t" + "st1 {v2.2d}, [%x[ctr]]\n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [in] "r" (in), [ks] "r" (ks), [te] "r" (te), [shuffle] "r" (shuffle) + : "memory", "cc", "x8", "x9", "x10", "x11", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} + +#endif /* WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_CBC) || defined(HAVE_AES_ECB) +static const word8 L_AES_ARM64_NEON_td[] = { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, + 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, + 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, + 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, + 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, + 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, + 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, + 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, + 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, + 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 
+ 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, + 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, + 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d, +}; /* End of L_AES_ARM64_NEON_td: the 256-byte AES inverse S-box (FIPS 197). AES_ECB_decrypt_NEON loads it into v16-v31 as four 64-byte quarters and uses one tbl lookup per quarter. */ + +static const word8 L_AES_ARM64_NEON_shift_rows_invshuffle[] = { /* tbl index vector applied to each block after the td substitution: InvShiftRows for a state whose bytes were reversed within each 32-bit word (rev32) on load. */ + 0x04, 0x09, 0x0e, 0x03, 0x08, 0x0d, 0x02, 0x07, + 0x0c, 0x01, 0x06, 0x0b, 0x00, 0x05, 0x0a, 0x0f, +}; + +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_ECB) +void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr); +void AES_ECB_decrypt_NEON(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr) +{ + const word8* td = L_AES_ARM64_NEON_td; + const word8* invshuffle = L_AES_ARM64_NEON_shift_rows_invshuffle; + __asm__ __volatile__ ( + "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[td]], #0x40\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[td]], #0x40\n\t" + "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[td]], #0x40\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + "cmp %x[len], #0x40\n\t" + "b.lt L_AES_ECB_decrypt_NEON_start_2_%=\n\t" + "\n" + "L_AES_ECB_decrypt_NEON_loop_4_%=: \n\t" + "mov x8, %x[ks]\n\t" + "ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t" + "ld1 {v4.2d}, [x8], #16\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "sub w7, %w[nr], #2\n\t" + "\n" + "L_AES_ECB_decrypt_NEON_loop_nr_4_%=: \n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, 
v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + 
"tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "movi v28.16b, #27\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "and v8.16b, v8.16b, v28.16b\n\t" + "and v9.16b, v9.16b, v28.16b\n\t" + "and v10.16b, v10.16b, v28.16b\n\t" + "and v11.16b, v11.16b, v28.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "ushr v12.16b, v4.16b, #6\n\t" + "ushr v13.16b, v5.16b, #6\n\t" + "ushr v14.16b, v6.16b, #6\n\t" + "ushr v15.16b, v7.16b, #6\n\t" + "shl v0.16b, v4.16b, #2\n\t" + "shl v1.16b, v5.16b, #2\n\t" + "shl v2.16b, v6.16b, #2\n\t" + "shl v3.16b, v7.16b, #2\n\t" + "pmul v12.16b, v12.16b, v28.16b\n\t" + "pmul v13.16b, v13.16b, v28.16b\n\t" + "pmul v14.16b, v14.16b, v28.16b\n\t" + "pmul v15.16b, v15.16b, v28.16b\n\t" + "eor v12.16b, v12.16b, v0.16b\n\t" + "eor v13.16b, v13.16b, v1.16b\n\t" + "eor v14.16b, v14.16b, v2.16b\n\t" + "eor v15.16b, v15.16b, v3.16b\n\t" + "ushr v0.16b, v4.16b, #5\n\t" + "ushr v1.16b, v5.16b, #5\n\t" + "ushr v2.16b, v6.16b, #5\n\t" + "ushr v3.16b, v7.16b, #5\n\t" + "pmul v0.16b, v0.16b, v28.16b\n\t" + "pmul v1.16b, v1.16b, v28.16b\n\t" + "pmul v2.16b, v2.16b, v28.16b\n\t" + "pmul v3.16b, v3.16b, v28.16b\n\t" + "shl v28.16b, v4.16b, #3\n\t" + "shl v29.16b, v5.16b, #3\n\t" + "shl v30.16b, v6.16b, #3\n\t" + "shl v31.16b, v7.16b, #3\n\t" + "eor v0.16b, v0.16b, v28.16b\n\t" + "eor v1.16b, v1.16b, v29.16b\n\t" + "eor v2.16b, v2.16b, v30.16b\n\t" + "eor v3.16b, v3.16b, v31.16b\n\t" + "eor v28.16b, v8.16b, v0.16b\n\t" + "eor v29.16b, v9.16b, v1.16b\n\t" + "eor v30.16b, v10.16b, v2.16b\n\t" + "eor v31.16b, v11.16b, v3.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, 
v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "eor v8.16b, v12.16b, v0.16b\n\t" + "eor v9.16b, v13.16b, v1.16b\n\t" + "eor v10.16b, v14.16b, v2.16b\n\t" + "eor v11.16b, v15.16b, v3.16b\n\t" + "eor v12.16b, v12.16b, v28.16b\n\t" + "eor v13.16b, v13.16b, v29.16b\n\t" + "eor v14.16b, v14.16b, v30.16b\n\t" + "eor v15.16b, v15.16b, v31.16b\n\t" + "eor v28.16b, v28.16b, v4.16b\n\t" + "eor v29.16b, v29.16b, v5.16b\n\t" + "eor v30.16b, v30.16b, v6.16b\n\t" + "eor v31.16b, v31.16b, v7.16b\n\t" + "shl v4.4s, v28.4s, #8\n\t" + "shl v5.4s, v29.4s, #8\n\t" + "shl v6.4s, v30.4s, #8\n\t" + "shl v7.4s, v31.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "rev32 v10.8h, v10.8h\n\t" + "rev32 v11.8h, v11.8h\n\t" + "sri v4.4s, v28.4s, #24\n\t" + "sri v5.4s, v29.4s, #24\n\t" + "sri v6.4s, v30.4s, #24\n\t" + "sri v7.4s, v31.4s, #24\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + "shl v28.4s, v0.4s, #24\n\t" + "shl v29.4s, v1.4s, #24\n\t" + "shl v30.4s, v2.4s, #24\n\t" + "shl v31.4s, v3.4s, #24\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "sri v28.4s, v0.4s, #8\n\t" + "sri v29.4s, v1.4s, #8\n\t" + "sri v30.4s, v2.4s, #8\n\t" + "sri v31.4s, v3.4s, #8\n\t" + "eor v4.16b, v4.16b, v28.16b\n\t" + "eor v5.16b, v5.16b, v29.16b\n\t" + "eor v6.16b, v6.16b, v30.16b\n\t" + "eor v7.16b, v7.16b, v31.16b\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x8], #16\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + 
"tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + "movi 
v28.16b, #27\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "sshr v10.16b, v2.16b, #7\n\t" + "sshr v11.16b, v3.16b, #7\n\t" + "shl v12.16b, v0.16b, #1\n\t" + "shl v13.16b, v1.16b, #1\n\t" + "shl v14.16b, v2.16b, #1\n\t" + "shl v15.16b, v3.16b, #1\n\t" + "and v8.16b, v8.16b, v28.16b\n\t" + "and v9.16b, v9.16b, v28.16b\n\t" + "and v10.16b, v10.16b, v28.16b\n\t" + "and v11.16b, v11.16b, v28.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "ushr v12.16b, v0.16b, #6\n\t" + "ushr v13.16b, v1.16b, #6\n\t" + "ushr v14.16b, v2.16b, #6\n\t" + "ushr v15.16b, v3.16b, #6\n\t" + "shl v4.16b, v0.16b, #2\n\t" + "shl v5.16b, v1.16b, #2\n\t" + "shl v6.16b, v2.16b, #2\n\t" + "shl v7.16b, v3.16b, #2\n\t" + "pmul v12.16b, v12.16b, v28.16b\n\t" + "pmul v13.16b, v13.16b, v28.16b\n\t" + "pmul v14.16b, v14.16b, v28.16b\n\t" + "pmul v15.16b, v15.16b, v28.16b\n\t" + "eor v12.16b, v12.16b, v4.16b\n\t" + "eor v13.16b, v13.16b, v5.16b\n\t" + "eor v14.16b, v14.16b, v6.16b\n\t" + "eor v15.16b, v15.16b, v7.16b\n\t" + "ushr v4.16b, v0.16b, #5\n\t" + "ushr v5.16b, v1.16b, #5\n\t" + "ushr v6.16b, v2.16b, #5\n\t" + "ushr v7.16b, v3.16b, #5\n\t" + "pmul v4.16b, v4.16b, v28.16b\n\t" + "pmul v5.16b, v5.16b, v28.16b\n\t" + "pmul v6.16b, v6.16b, v28.16b\n\t" + "pmul v7.16b, v7.16b, v28.16b\n\t" + "shl v28.16b, v0.16b, #3\n\t" + "shl v29.16b, v1.16b, #3\n\t" + "shl v30.16b, v2.16b, #3\n\t" + "shl v31.16b, v3.16b, #3\n\t" + "eor v4.16b, v4.16b, v28.16b\n\t" + "eor v5.16b, v5.16b, v29.16b\n\t" + "eor v6.16b, v6.16b, v30.16b\n\t" + "eor v7.16b, v7.16b, v31.16b\n\t" + "eor v28.16b, v8.16b, v4.16b\n\t" + "eor v29.16b, v9.16b, v5.16b\n\t" + "eor v30.16b, v10.16b, v6.16b\n\t" + "eor v31.16b, v11.16b, v7.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + "eor v8.16b, v12.16b, 
v4.16b\n\t" + "eor v9.16b, v13.16b, v5.16b\n\t" + "eor v10.16b, v14.16b, v6.16b\n\t" + "eor v11.16b, v15.16b, v7.16b\n\t" + "eor v12.16b, v12.16b, v28.16b\n\t" + "eor v13.16b, v13.16b, v29.16b\n\t" + "eor v14.16b, v14.16b, v30.16b\n\t" + "eor v15.16b, v15.16b, v31.16b\n\t" + "eor v28.16b, v28.16b, v0.16b\n\t" + "eor v29.16b, v29.16b, v1.16b\n\t" + "eor v30.16b, v30.16b, v2.16b\n\t" + "eor v31.16b, v31.16b, v3.16b\n\t" + "shl v0.4s, v28.4s, #8\n\t" + "shl v1.4s, v29.4s, #8\n\t" + "shl v2.4s, v30.4s, #8\n\t" + "shl v3.4s, v31.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "rev32 v10.8h, v10.8h\n\t" + "rev32 v11.8h, v11.8h\n\t" + "sri v0.4s, v28.4s, #24\n\t" + "sri v1.4s, v29.4s, #24\n\t" + "sri v2.4s, v30.4s, #24\n\t" + "sri v3.4s, v31.4s, #24\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "eor v2.16b, v2.16b, v14.16b\n\t" + "eor v3.16b, v3.16b, v15.16b\n\t" + "shl v28.4s, v4.4s, #24\n\t" + "shl v29.4s, v5.4s, #24\n\t" + "shl v30.4s, v6.4s, #24\n\t" + "shl v31.4s, v7.4s, #24\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "sri v28.4s, v4.4s, #8\n\t" + "sri v29.4s, v5.4s, #8\n\t" + "sri v30.4s, v6.4s, #8\n\t" + "sri v31.4s, v7.4s, #8\n\t" + "eor v0.16b, v0.16b, v28.16b\n\t" + "eor v1.16b, v1.16b, v29.16b\n\t" + "eor v2.16b, v2.16b, v30.16b\n\t" + "eor v3.16b, v3.16b, v31.16b\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x8], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + /* Round Done */ + "subs w7, w7, #2\n\t" + "b.ne L_AES_ECB_decrypt_NEON_loop_nr_4_%=\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + 
"tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "movi 
v28.16b, #27\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "and v8.16b, v8.16b, v28.16b\n\t" + "and v9.16b, v9.16b, v28.16b\n\t" + "and v10.16b, v10.16b, v28.16b\n\t" + "and v11.16b, v11.16b, v28.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "ushr v12.16b, v4.16b, #6\n\t" + "ushr v13.16b, v5.16b, #6\n\t" + "ushr v14.16b, v6.16b, #6\n\t" + "ushr v15.16b, v7.16b, #6\n\t" + "shl v0.16b, v4.16b, #2\n\t" + "shl v1.16b, v5.16b, #2\n\t" + "shl v2.16b, v6.16b, #2\n\t" + "shl v3.16b, v7.16b, #2\n\t" + "pmul v12.16b, v12.16b, v28.16b\n\t" + "pmul v13.16b, v13.16b, v28.16b\n\t" + "pmul v14.16b, v14.16b, v28.16b\n\t" + "pmul v15.16b, v15.16b, v28.16b\n\t" + "eor v12.16b, v12.16b, v0.16b\n\t" + "eor v13.16b, v13.16b, v1.16b\n\t" + "eor v14.16b, v14.16b, v2.16b\n\t" + "eor v15.16b, v15.16b, v3.16b\n\t" + "ushr v0.16b, v4.16b, #5\n\t" + "ushr v1.16b, v5.16b, #5\n\t" + "ushr v2.16b, v6.16b, #5\n\t" + "ushr v3.16b, v7.16b, #5\n\t" + "pmul v0.16b, v0.16b, v28.16b\n\t" + "pmul v1.16b, v1.16b, v28.16b\n\t" + "pmul v2.16b, v2.16b, v28.16b\n\t" + "pmul v3.16b, v3.16b, v28.16b\n\t" + "shl v28.16b, v4.16b, #3\n\t" + "shl v29.16b, v5.16b, #3\n\t" + "shl v30.16b, v6.16b, #3\n\t" + "shl v31.16b, v7.16b, #3\n\t" + "eor v0.16b, v0.16b, v28.16b\n\t" + "eor v1.16b, v1.16b, v29.16b\n\t" + "eor v2.16b, v2.16b, v30.16b\n\t" + "eor v3.16b, v3.16b, v31.16b\n\t" + "eor v28.16b, v8.16b, v0.16b\n\t" + "eor v29.16b, v9.16b, v1.16b\n\t" + "eor v30.16b, v10.16b, v2.16b\n\t" + "eor v31.16b, v11.16b, v3.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "eor v8.16b, v12.16b, 
v0.16b\n\t" + "eor v9.16b, v13.16b, v1.16b\n\t" + "eor v10.16b, v14.16b, v2.16b\n\t" + "eor v11.16b, v15.16b, v3.16b\n\t" + "eor v12.16b, v12.16b, v28.16b\n\t" + "eor v13.16b, v13.16b, v29.16b\n\t" + "eor v14.16b, v14.16b, v30.16b\n\t" + "eor v15.16b, v15.16b, v31.16b\n\t" + "eor v28.16b, v28.16b, v4.16b\n\t" + "eor v29.16b, v29.16b, v5.16b\n\t" + "eor v30.16b, v30.16b, v6.16b\n\t" + "eor v31.16b, v31.16b, v7.16b\n\t" + "shl v4.4s, v28.4s, #8\n\t" + "shl v5.4s, v29.4s, #8\n\t" + "shl v6.4s, v30.4s, #8\n\t" + "shl v7.4s, v31.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "rev32 v10.8h, v10.8h\n\t" + "rev32 v11.8h, v11.8h\n\t" + "sri v4.4s, v28.4s, #24\n\t" + "sri v5.4s, v29.4s, #24\n\t" + "sri v6.4s, v30.4s, #24\n\t" + "sri v7.4s, v31.4s, #24\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + "shl v28.4s, v0.4s, #24\n\t" + "shl v29.4s, v1.4s, #24\n\t" + "shl v30.4s, v2.4s, #24\n\t" + "shl v31.4s, v3.4s, #24\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "sri v28.4s, v0.4s, #8\n\t" + "sri v29.4s, v1.4s, #8\n\t" + "sri v30.4s, v2.4s, #8\n\t" + "sri v31.4s, v3.4s, #8\n\t" + "eor v4.16b, v4.16b, v28.16b\n\t" + "eor v5.16b, v5.16b, v29.16b\n\t" + "eor v6.16b, v6.16b, v30.16b\n\t" + "eor v7.16b, v7.16b, v31.16b\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x8], #16\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi 
v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x8], #16\n\t" + "eor v0.16b, v0.16b, 
v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" + "sub %x[len], %x[len], #0x40\n\t" + "cmp %x[len], #0x40\n\t" + "b.ge L_AES_ECB_decrypt_NEON_loop_4_%=\n\t" + "\n" + "L_AES_ECB_decrypt_NEON_start_2_%=: \n\t" + "cmp %x[len], #16\n\t" + "b.eq L_AES_ECB_decrypt_NEON_start_1_%=\n\t" + "b.lt L_AES_ECB_decrypt_NEON_data_done_%=\n\t" + "\n" + "L_AES_ECB_decrypt_NEON_loop_2_%=: \n\t" + "mov x8, %x[ks]\n\t" + "ld1 {v0.16b, v1.16b}, [%x[in]], #32\n\t" + "ld1 {v4.2d}, [x8], #16\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "sub w7, %w[nr], #2\n\t" + "\n" + "L_AES_ECB_decrypt_NEON_loop_nr_2_%=: \n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + 
"orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "movi v10.16b, #27\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v10.16b\n\t" + "and v9.16b, v9.16b, v10.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "ushr v12.16b, v4.16b, #6\n\t" + "ushr v13.16b, v5.16b, #6\n\t" + "shl v0.16b, v4.16b, #2\n\t" + "shl v1.16b, v5.16b, #2\n\t" + "pmul v12.16b, v12.16b, v10.16b\n\t" + "pmul v13.16b, v13.16b, v10.16b\n\t" + "eor v12.16b, v12.16b, v0.16b\n\t" + "eor v13.16b, v13.16b, v1.16b\n\t" + "ushr v0.16b, v4.16b, #5\n\t" + "ushr v1.16b, v5.16b, #5\n\t" + "pmul v0.16b, v0.16b, v10.16b\n\t" + "pmul v1.16b, v1.16b, v10.16b\n\t" + "shl v10.16b, v4.16b, #3\n\t" + "shl v11.16b, v5.16b, #3\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + "eor v10.16b, v8.16b, v0.16b\n\t" + "eor v11.16b, v9.16b, v1.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v8.16b, v12.16b, v0.16b\n\t" + "eor v9.16b, v13.16b, v1.16b\n\t" + "eor v12.16b, v12.16b, v10.16b\n\t" + "eor v13.16b, v13.16b, v11.16b\n\t" + "eor v10.16b, v10.16b, v4.16b\n\t" + "eor v11.16b, v11.16b, v5.16b\n\t" + "shl v4.4s, v10.4s, #8\n\t" + "shl v5.4s, v11.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "sri v4.4s, v10.4s, #24\n\t" + "sri v5.4s, v11.4s, #24\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "shl v10.4s, v0.4s, #24\n\t" + "shl v11.4s, v1.4s, #24\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "sri v10.4s, v0.4s, #8\n\t" + "sri v11.4s, v1.4s, #8\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x8], #16\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, 
v0.16b\n\t" + /* Round Done */ + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "movi v10.16b, #27\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "shl v12.16b, v0.16b, #1\n\t" + "shl v13.16b, v1.16b, #1\n\t" + "and v8.16b, v8.16b, v10.16b\n\t" + "and v9.16b, v9.16b, v10.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "ushr v12.16b, v0.16b, #6\n\t" + "ushr v13.16b, v1.16b, #6\n\t" + "shl v4.16b, v0.16b, #2\n\t" + "shl v5.16b, v1.16b, #2\n\t" + "pmul v12.16b, v12.16b, v10.16b\n\t" + "pmul v13.16b, v13.16b, v10.16b\n\t" + "eor v12.16b, v12.16b, v4.16b\n\t" + "eor v13.16b, v13.16b, v5.16b\n\t" + "ushr v4.16b, v0.16b, #5\n\t" + "ushr v5.16b, v1.16b, #5\n\t" + "pmul v4.16b, v4.16b, v10.16b\n\t" + "pmul v5.16b, v5.16b, v10.16b\n\t" + "shl v10.16b, v0.16b, #3\n\t" + "shl v11.16b, v1.16b, #3\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + 
"eor v5.16b, v5.16b, v11.16b\n\t" + "eor v10.16b, v8.16b, v4.16b\n\t" + "eor v11.16b, v9.16b, v5.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v8.16b, v12.16b, v4.16b\n\t" + "eor v9.16b, v13.16b, v5.16b\n\t" + "eor v12.16b, v12.16b, v10.16b\n\t" + "eor v13.16b, v13.16b, v11.16b\n\t" + "eor v10.16b, v10.16b, v0.16b\n\t" + "eor v11.16b, v11.16b, v1.16b\n\t" + "shl v0.4s, v10.4s, #8\n\t" + "shl v1.4s, v11.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "sri v0.4s, v10.4s, #24\n\t" + "sri v1.4s, v11.4s, #24\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "shl v10.4s, v4.4s, #24\n\t" + "shl v11.4s, v5.4s, #24\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "sri v10.4s, v4.4s, #8\n\t" + "sri v11.4s, v5.4s, #8\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x8], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + /* Round Done */ + "subs w7, w7, #2\n\t" + "b.ne L_AES_ECB_decrypt_NEON_loop_nr_2_%=\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, 
v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "movi v10.16b, #27\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v10.16b\n\t" + "and v9.16b, v9.16b, v10.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "ushr v12.16b, v4.16b, #6\n\t" + "ushr v13.16b, v5.16b, #6\n\t" + "shl v0.16b, v4.16b, #2\n\t" + "shl v1.16b, v5.16b, #2\n\t" + "pmul v12.16b, v12.16b, v10.16b\n\t" + "pmul v13.16b, v13.16b, v10.16b\n\t" + "eor v12.16b, v12.16b, v0.16b\n\t" + "eor v13.16b, v13.16b, v1.16b\n\t" + "ushr v0.16b, v4.16b, #5\n\t" + "ushr v1.16b, v5.16b, #5\n\t" + "pmul v0.16b, v0.16b, v10.16b\n\t" + "pmul v1.16b, v1.16b, v10.16b\n\t" + "shl v10.16b, v4.16b, #3\n\t" + "shl v11.16b, v5.16b, #3\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + "eor v10.16b, v8.16b, v0.16b\n\t" + "eor v11.16b, v9.16b, v1.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v8.16b, v12.16b, v0.16b\n\t" + "eor v9.16b, v13.16b, v1.16b\n\t" + "eor v12.16b, v12.16b, v10.16b\n\t" + "eor v13.16b, v13.16b, v11.16b\n\t" + "eor v10.16b, v10.16b, v4.16b\n\t" + "eor v11.16b, v11.16b, v5.16b\n\t" + "shl v4.4s, v10.4s, #8\n\t" + "shl v5.4s, v11.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "sri v4.4s, v10.4s, #24\n\t" + "sri v5.4s, v11.4s, #24\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "shl v10.4s, v0.4s, #24\n\t" + "shl v11.4s, v1.4s, #24\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "sri v10.4s, v0.4s, #8\n\t" + "sri v11.4s, v1.4s, #8\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, 
v5.16b, v11.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x8], #16\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + /* Round Done */ + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x8], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "st1 {v0.16b, v1.16b}, [%x[out]], #32\n\t" + "sub %x[len], %x[len], #32\n\t" + "cmp %x[len], #0\n\t" + "b.eq L_AES_ECB_decrypt_NEON_data_done_%=\n\t" + "\n" + "L_AES_ECB_decrypt_NEON_start_1_%=: \n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "movi v15.16b, #27\n\t" + "ld1 {v3.2d}, [%[invshuffle]]\n\t" + "mov x8, %x[ks]\n\t" + "ld1 {v0.16b}, [%x[in]], #16\n\t" + "ld1 {v4.2d}, [x8], #16\n\t" + "rev32 v0.16b, v0.16b\n\t" + /* 
Round: 0 - XOR in key schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "sub w7, %w[nr], #2\n\t" + "\n" + "L_AES_ECB_decrypt_NEON_loop_nr_1_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x8], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, 
v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "ushr v11.16b, v0.16b, #6\n\t" + "ushr v8.16b, v0.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v0.16b\n\t" + "shl v0.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v0.4s, v9.4s, #24\n\t" + "eor v0.16b, v0.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v0.16b, v0.16b, v9.16b\n\t" + "ld1 {v4.2d}, [x8], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "subs w7, w7, #2\n\t" + "b.ne L_AES_ECB_decrypt_NEON_loop_nr_1_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + 
"shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x8], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x8], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "st1 {v0.16b}, [%x[out]], #16\n\t" + "\n" + "L_AES_ECB_decrypt_NEON_data_done_%=: \n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr) + : [in] "r" (in), [ks] "r" (ks), [td] "r" (td), + [invshuffle] "r" (invshuffle) + : "memory", "cc", "x7", "x8", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31" + ); +} + +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || defined(HAVE_AES_ECB) */ +#ifdef HAVE_AES_CBC +void AES_CBC_decrypt_NEON(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); +void AES_CBC_decrypt_NEON(const 
unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv) +{ + const word8* td = L_AES_ARM64_NEON_td; + const word8* invshuffle = L_AES_ARM64_NEON_shift_rows_invshuffle; + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-96]!\n\t" + "add x29, sp, #0\n\t" + "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[td]], #0x40\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[td]], #0x40\n\t" + "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[td]], #0x40\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + "ld1 {v3.2d}, [%x[iv]]\n\t" + "add x10, x29, #16\n\t" + "cmp %x[len], #0x40\n\t" + "b.lt L_AES_CBC_decrypt_NEON_start_2_%=\n\t" + "\n" + "L_AES_CBC_decrypt_NEON_loop_4_%=: \n\t" + "mov x9, %x[ks]\n\t" + "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t" + "st1 {v3.2d, v4.2d, v5.2d, v6.2d}, [x10]\n\t" + "str q7, [x10, #64]\n\t" + "ld1 {v8.2d}, [x9], #16\n\t" + "rev32 v4.16b, v4.16b\n\t" + "rev32 v5.16b, v5.16b\n\t" + "rev32 v6.16b, v6.16b\n\t" + "rev32 v7.16b, v7.16b\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v8.16b\n\t" + "eor v6.16b, v6.16b, v8.16b\n\t" + "eor v7.16b, v7.16b, v8.16b\n\t" + "sub w8, %w[nr], #2\n\t" + "\n" + "L_AES_CBC_decrypt_NEON_loop_nr_4_%=: \n\t" + "tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v9.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v10.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v11.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v0.16b, v4.16b, v12.16b\n\t" + "eor v1.16b, v5.16b, v12.16b\n\t" + "eor v2.16b, v6.16b, v12.16b\n\t" + "eor v3.16b, v7.16b, v12.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b\n\t" + "tbl v2.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v2.16b\n\t" + "tbl 
v3.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v3.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "eor v0.16b, v4.16b, v13.16b\n\t" + "eor v1.16b, v5.16b, v13.16b\n\t" + "eor v2.16b, v6.16b, v13.16b\n\t" + "eor v3.16b, v7.16b, v13.16b\n\t" + "tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b\n\t" + "tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b\n\t" + "tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b\n\t" + "tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "eor v0.16b, v4.16b, v14.16b\n\t" + "eor v1.16b, v5.16b, v14.16b\n\t" + "eor v2.16b, v6.16b, v14.16b\n\t" + "eor v3.16b, v7.16b, v14.16b\n\t" + "tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b\n\t" + "tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b\n\t" + "tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b\n\t" + "tbl v3.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v3.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v8.16b, {v8.16b}, v4.16b\n\t" + "tbl v9.16b, {v9.16b}, v4.16b\n\t" + "tbl v10.16b, {v10.16b}, v4.16b\n\t" + "tbl v11.16b, {v11.16b}, v4.16b\n\t" + "movi v28.16b, #27\n\t" + "sshr v0.16b, v8.16b, #7\n\t" + "sshr v1.16b, v9.16b, #7\n\t" + "sshr v2.16b, v10.16b, #7\n\t" + "sshr v3.16b, v11.16b, #7\n\t" + "shl v12.16b, v8.16b, #1\n\t" + "shl v13.16b, v9.16b, #1\n\t" + "shl v14.16b, v10.16b, #1\n\t" + "shl v15.16b, v11.16b, #1\n\t" + "and v0.16b, v0.16b, v28.16b\n\t" + "and v1.16b, v1.16b, v28.16b\n\t" + "and v2.16b, v2.16b, v28.16b\n\t" + "and v3.16b, v3.16b, v28.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "eor 
v2.16b, v2.16b, v14.16b\n\t" + "eor v3.16b, v3.16b, v15.16b\n\t" + "ushr v12.16b, v8.16b, #6\n\t" + "ushr v13.16b, v9.16b, #6\n\t" + "ushr v14.16b, v10.16b, #6\n\t" + "ushr v15.16b, v11.16b, #6\n\t" + "shl v4.16b, v8.16b, #2\n\t" + "shl v5.16b, v9.16b, #2\n\t" + "shl v6.16b, v10.16b, #2\n\t" + "shl v7.16b, v11.16b, #2\n\t" + "pmul v12.16b, v12.16b, v28.16b\n\t" + "pmul v13.16b, v13.16b, v28.16b\n\t" + "pmul v14.16b, v14.16b, v28.16b\n\t" + "pmul v15.16b, v15.16b, v28.16b\n\t" + "eor v12.16b, v12.16b, v4.16b\n\t" + "eor v13.16b, v13.16b, v5.16b\n\t" + "eor v14.16b, v14.16b, v6.16b\n\t" + "eor v15.16b, v15.16b, v7.16b\n\t" + "ushr v4.16b, v8.16b, #5\n\t" + "ushr v5.16b, v9.16b, #5\n\t" + "ushr v6.16b, v10.16b, #5\n\t" + "ushr v7.16b, v11.16b, #5\n\t" + "pmul v4.16b, v4.16b, v28.16b\n\t" + "pmul v5.16b, v5.16b, v28.16b\n\t" + "pmul v6.16b, v6.16b, v28.16b\n\t" + "pmul v7.16b, v7.16b, v28.16b\n\t" + "shl v28.16b, v8.16b, #3\n\t" + "shl v29.16b, v9.16b, #3\n\t" + "shl v30.16b, v10.16b, #3\n\t" + "shl v31.16b, v11.16b, #3\n\t" + "eor v4.16b, v4.16b, v28.16b\n\t" + "eor v5.16b, v5.16b, v29.16b\n\t" + "eor v6.16b, v6.16b, v30.16b\n\t" + "eor v7.16b, v7.16b, v31.16b\n\t" + "eor v28.16b, v0.16b, v4.16b\n\t" + "eor v29.16b, v1.16b, v5.16b\n\t" + "eor v30.16b, v2.16b, v6.16b\n\t" + "eor v31.16b, v3.16b, v7.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v0.16b, v12.16b, v4.16b\n\t" + "eor v1.16b, v13.16b, v5.16b\n\t" + "eor v2.16b, v14.16b, v6.16b\n\t" + "eor v3.16b, v15.16b, v7.16b\n\t" + "eor v12.16b, v12.16b, v28.16b\n\t" + "eor v13.16b, v13.16b, v29.16b\n\t" + "eor v14.16b, v14.16b, v30.16b\n\t" + "eor v15.16b, v15.16b, v31.16b\n\t" + "eor v28.16b, v28.16b, v8.16b\n\t" + "eor v29.16b, v29.16b, v9.16b\n\t" + "eor v30.16b, v30.16b, v10.16b\n\t" + "eor v31.16b, v31.16b, v11.16b\n\t" + "shl v8.4s, v28.4s, #8\n\t" + "shl v9.4s, v29.4s, #8\n\t" + "shl v10.4s, v30.4s, 
#8\n\t" + "shl v11.4s, v31.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "rev32 v2.8h, v2.8h\n\t" + "rev32 v3.8h, v3.8h\n\t" + "sri v8.4s, v28.4s, #24\n\t" + "sri v9.4s, v29.4s, #24\n\t" + "sri v10.4s, v30.4s, #24\n\t" + "sri v11.4s, v31.4s, #24\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "shl v28.4s, v4.4s, #24\n\t" + "shl v29.4s, v5.4s, #24\n\t" + "shl v30.4s, v6.4s, #24\n\t" + "shl v31.4s, v7.4s, #24\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v9.16b, v9.16b, v1.16b\n\t" + "eor v10.16b, v10.16b, v2.16b\n\t" + "eor v11.16b, v11.16b, v3.16b\n\t" + "sri v28.4s, v4.4s, #8\n\t" + "sri v29.4s, v5.4s, #8\n\t" + "sri v30.4s, v6.4s, #8\n\t" + "sri v31.4s, v7.4s, #8\n\t" + "eor v8.16b, v8.16b, v28.16b\n\t" + "eor v9.16b, v9.16b, v29.16b\n\t" + "eor v10.16b, v10.16b, v30.16b\n\t" + "eor v11.16b, v11.16b, v31.16b\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v4.16b\n\t" + "eor v11.16b, v11.16b, v4.16b\n\t" + /* Round Done */ + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v9.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v10.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v11.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v0.16b, v8.16b, v12.16b\n\t" + "eor v1.16b, v9.16b, v12.16b\n\t" + "eor v2.16b, v10.16b, v12.16b\n\t" + "eor v3.16b, v11.16b, v12.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b\n\t" + "tbl v2.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v2.16b\n\t" + "tbl v3.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v3.16b\n\t" + "orr 
v4.16b, v4.16b, v0.16b\n\t" + "orr v5.16b, v5.16b, v1.16b\n\t" + "orr v6.16b, v6.16b, v2.16b\n\t" + "orr v7.16b, v7.16b, v3.16b\n\t" + "eor v0.16b, v8.16b, v13.16b\n\t" + "eor v1.16b, v9.16b, v13.16b\n\t" + "eor v2.16b, v10.16b, v13.16b\n\t" + "eor v3.16b, v11.16b, v13.16b\n\t" + "tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b\n\t" + "tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b\n\t" + "tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b\n\t" + "tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v5.16b, v5.16b, v1.16b\n\t" + "orr v6.16b, v6.16b, v2.16b\n\t" + "orr v7.16b, v7.16b, v3.16b\n\t" + "eor v0.16b, v8.16b, v14.16b\n\t" + "eor v1.16b, v9.16b, v14.16b\n\t" + "eor v2.16b, v10.16b, v14.16b\n\t" + "eor v3.16b, v11.16b, v14.16b\n\t" + "tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b\n\t" + "tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b\n\t" + "tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b\n\t" + "tbl v3.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v3.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v5.16b, v5.16b, v1.16b\n\t" + "orr v6.16b, v6.16b, v2.16b\n\t" + "orr v7.16b, v7.16b, v3.16b\n\t" + "ld1 {v8.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v8.16b\n\t" + "tbl v5.16b, {v5.16b}, v8.16b\n\t" + "tbl v6.16b, {v6.16b}, v8.16b\n\t" + "tbl v7.16b, {v7.16b}, v8.16b\n\t" + "movi v28.16b, #27\n\t" + "sshr v0.16b, v4.16b, #7\n\t" + "sshr v1.16b, v5.16b, #7\n\t" + "sshr v2.16b, v6.16b, #7\n\t" + "sshr v3.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "and v0.16b, v0.16b, v28.16b\n\t" + "and v1.16b, v1.16b, v28.16b\n\t" + "and v2.16b, v2.16b, v28.16b\n\t" + "and v3.16b, v3.16b, v28.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "eor v2.16b, v2.16b, v14.16b\n\t" + "eor v3.16b, v3.16b, v15.16b\n\t" + "ushr v12.16b, 
v4.16b, #6\n\t" + "ushr v13.16b, v5.16b, #6\n\t" + "ushr v14.16b, v6.16b, #6\n\t" + "ushr v15.16b, v7.16b, #6\n\t" + "shl v8.16b, v4.16b, #2\n\t" + "shl v9.16b, v5.16b, #2\n\t" + "shl v10.16b, v6.16b, #2\n\t" + "shl v11.16b, v7.16b, #2\n\t" + "pmul v12.16b, v12.16b, v28.16b\n\t" + "pmul v13.16b, v13.16b, v28.16b\n\t" + "pmul v14.16b, v14.16b, v28.16b\n\t" + "pmul v15.16b, v15.16b, v28.16b\n\t" + "eor v12.16b, v12.16b, v8.16b\n\t" + "eor v13.16b, v13.16b, v9.16b\n\t" + "eor v14.16b, v14.16b, v10.16b\n\t" + "eor v15.16b, v15.16b, v11.16b\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "ushr v9.16b, v5.16b, #5\n\t" + "ushr v10.16b, v6.16b, #5\n\t" + "ushr v11.16b, v7.16b, #5\n\t" + "pmul v8.16b, v8.16b, v28.16b\n\t" + "pmul v9.16b, v9.16b, v28.16b\n\t" + "pmul v10.16b, v10.16b, v28.16b\n\t" + "pmul v11.16b, v11.16b, v28.16b\n\t" + "shl v28.16b, v4.16b, #3\n\t" + "shl v29.16b, v5.16b, #3\n\t" + "shl v30.16b, v6.16b, #3\n\t" + "shl v31.16b, v7.16b, #3\n\t" + "eor v8.16b, v8.16b, v28.16b\n\t" + "eor v9.16b, v9.16b, v29.16b\n\t" + "eor v10.16b, v10.16b, v30.16b\n\t" + "eor v11.16b, v11.16b, v31.16b\n\t" + "eor v28.16b, v0.16b, v8.16b\n\t" + "eor v29.16b, v1.16b, v9.16b\n\t" + "eor v30.16b, v2.16b, v10.16b\n\t" + "eor v31.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v9.16b, v9.16b, v5.16b\n\t" + "eor v10.16b, v10.16b, v6.16b\n\t" + "eor v11.16b, v11.16b, v7.16b\n\t" + "eor v0.16b, v12.16b, v8.16b\n\t" + "eor v1.16b, v13.16b, v9.16b\n\t" + "eor v2.16b, v14.16b, v10.16b\n\t" + "eor v3.16b, v15.16b, v11.16b\n\t" + "eor v12.16b, v12.16b, v28.16b\n\t" + "eor v13.16b, v13.16b, v29.16b\n\t" + "eor v14.16b, v14.16b, v30.16b\n\t" + "eor v15.16b, v15.16b, v31.16b\n\t" + "eor v28.16b, v28.16b, v4.16b\n\t" + "eor v29.16b, v29.16b, v5.16b\n\t" + "eor v30.16b, v30.16b, v6.16b\n\t" + "eor v31.16b, v31.16b, v7.16b\n\t" + "shl v4.4s, v28.4s, #8\n\t" + "shl v5.4s, v29.4s, #8\n\t" + "shl v6.4s, v30.4s, #8\n\t" + "shl v7.4s, v31.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 
v1.8h, v1.8h\n\t" + "rev32 v2.8h, v2.8h\n\t" + "rev32 v3.8h, v3.8h\n\t" + "sri v4.4s, v28.4s, #24\n\t" + "sri v5.4s, v29.4s, #24\n\t" + "sri v6.4s, v30.4s, #24\n\t" + "sri v7.4s, v31.4s, #24\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + "shl v28.4s, v8.4s, #24\n\t" + "shl v29.4s, v9.4s, #24\n\t" + "shl v30.4s, v10.4s, #24\n\t" + "shl v31.4s, v11.4s, #24\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + "sri v28.4s, v8.4s, #8\n\t" + "sri v29.4s, v9.4s, #8\n\t" + "sri v30.4s, v10.4s, #8\n\t" + "sri v31.4s, v11.4s, #8\n\t" + "eor v4.16b, v4.16b, v28.16b\n\t" + "eor v5.16b, v5.16b, v29.16b\n\t" + "eor v6.16b, v6.16b, v30.16b\n\t" + "eor v7.16b, v7.16b, v31.16b\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + /* XOR in Key Schedule */ + "ld1 {v8.2d}, [x9], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v8.16b\n\t" + "eor v6.16b, v6.16b, v8.16b\n\t" + "eor v7.16b, v7.16b, v8.16b\n\t" + /* Round Done */ + "subs w8, w8, #2\n\t" + "b.ne L_AES_CBC_decrypt_NEON_loop_nr_4_%=\n\t" + "tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v9.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v10.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v11.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v0.16b, v4.16b, v12.16b\n\t" + "eor v1.16b, v5.16b, v12.16b\n\t" + "eor v2.16b, v6.16b, v12.16b\n\t" + "eor v3.16b, v7.16b, v12.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b\n\t" + "tbl v2.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v2.16b\n\t" + "tbl v3.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v3.16b\n\t" + "orr v8.16b, v8.16b, 
v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "eor v0.16b, v4.16b, v13.16b\n\t" + "eor v1.16b, v5.16b, v13.16b\n\t" + "eor v2.16b, v6.16b, v13.16b\n\t" + "eor v3.16b, v7.16b, v13.16b\n\t" + "tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b\n\t" + "tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b\n\t" + "tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b\n\t" + "tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "eor v0.16b, v4.16b, v14.16b\n\t" + "eor v1.16b, v5.16b, v14.16b\n\t" + "eor v2.16b, v6.16b, v14.16b\n\t" + "eor v3.16b, v7.16b, v14.16b\n\t" + "tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b\n\t" + "tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b\n\t" + "tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b\n\t" + "tbl v3.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v3.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "orr v10.16b, v10.16b, v2.16b\n\t" + "orr v11.16b, v11.16b, v3.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v8.16b, {v8.16b}, v4.16b\n\t" + "tbl v9.16b, {v9.16b}, v4.16b\n\t" + "tbl v10.16b, {v10.16b}, v4.16b\n\t" + "tbl v11.16b, {v11.16b}, v4.16b\n\t" + "movi v28.16b, #27\n\t" + "sshr v0.16b, v8.16b, #7\n\t" + "sshr v1.16b, v9.16b, #7\n\t" + "sshr v2.16b, v10.16b, #7\n\t" + "sshr v3.16b, v11.16b, #7\n\t" + "shl v12.16b, v8.16b, #1\n\t" + "shl v13.16b, v9.16b, #1\n\t" + "shl v14.16b, v10.16b, #1\n\t" + "shl v15.16b, v11.16b, #1\n\t" + "and v0.16b, v0.16b, v28.16b\n\t" + "and v1.16b, v1.16b, v28.16b\n\t" + "and v2.16b, v2.16b, v28.16b\n\t" + "and v3.16b, v3.16b, v28.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "eor v2.16b, v2.16b, v14.16b\n\t" + "eor v3.16b, v3.16b, v15.16b\n\t" + "ushr v12.16b, 
v8.16b, #6\n\t" + "ushr v13.16b, v9.16b, #6\n\t" + "ushr v14.16b, v10.16b, #6\n\t" + "ushr v15.16b, v11.16b, #6\n\t" + "shl v4.16b, v8.16b, #2\n\t" + "shl v5.16b, v9.16b, #2\n\t" + "shl v6.16b, v10.16b, #2\n\t" + "shl v7.16b, v11.16b, #2\n\t" + "pmul v12.16b, v12.16b, v28.16b\n\t" + "pmul v13.16b, v13.16b, v28.16b\n\t" + "pmul v14.16b, v14.16b, v28.16b\n\t" + "pmul v15.16b, v15.16b, v28.16b\n\t" + "eor v12.16b, v12.16b, v4.16b\n\t" + "eor v13.16b, v13.16b, v5.16b\n\t" + "eor v14.16b, v14.16b, v6.16b\n\t" + "eor v15.16b, v15.16b, v7.16b\n\t" + "ushr v4.16b, v8.16b, #5\n\t" + "ushr v5.16b, v9.16b, #5\n\t" + "ushr v6.16b, v10.16b, #5\n\t" + "ushr v7.16b, v11.16b, #5\n\t" + "pmul v4.16b, v4.16b, v28.16b\n\t" + "pmul v5.16b, v5.16b, v28.16b\n\t" + "pmul v6.16b, v6.16b, v28.16b\n\t" + "pmul v7.16b, v7.16b, v28.16b\n\t" + "shl v28.16b, v8.16b, #3\n\t" + "shl v29.16b, v9.16b, #3\n\t" + "shl v30.16b, v10.16b, #3\n\t" + "shl v31.16b, v11.16b, #3\n\t" + "eor v4.16b, v4.16b, v28.16b\n\t" + "eor v5.16b, v5.16b, v29.16b\n\t" + "eor v6.16b, v6.16b, v30.16b\n\t" + "eor v7.16b, v7.16b, v31.16b\n\t" + "eor v28.16b, v0.16b, v4.16b\n\t" + "eor v29.16b, v1.16b, v5.16b\n\t" + "eor v30.16b, v2.16b, v6.16b\n\t" + "eor v31.16b, v3.16b, v7.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v0.16b, v12.16b, v4.16b\n\t" + "eor v1.16b, v13.16b, v5.16b\n\t" + "eor v2.16b, v14.16b, v6.16b\n\t" + "eor v3.16b, v15.16b, v7.16b\n\t" + "eor v12.16b, v12.16b, v28.16b\n\t" + "eor v13.16b, v13.16b, v29.16b\n\t" + "eor v14.16b, v14.16b, v30.16b\n\t" + "eor v15.16b, v15.16b, v31.16b\n\t" + "eor v28.16b, v28.16b, v8.16b\n\t" + "eor v29.16b, v29.16b, v9.16b\n\t" + "eor v30.16b, v30.16b, v10.16b\n\t" + "eor v31.16b, v31.16b, v11.16b\n\t" + "shl v8.4s, v28.4s, #8\n\t" + "shl v9.4s, v29.4s, #8\n\t" + "shl v10.4s, v30.4s, #8\n\t" + "shl v11.4s, v31.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, 
v1.8h\n\t" + "rev32 v2.8h, v2.8h\n\t" + "rev32 v3.8h, v3.8h\n\t" + "sri v8.4s, v28.4s, #24\n\t" + "sri v9.4s, v29.4s, #24\n\t" + "sri v10.4s, v30.4s, #24\n\t" + "sri v11.4s, v31.4s, #24\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "shl v28.4s, v4.4s, #24\n\t" + "shl v29.4s, v5.4s, #24\n\t" + "shl v30.4s, v6.4s, #24\n\t" + "shl v31.4s, v7.4s, #24\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v9.16b, v9.16b, v1.16b\n\t" + "eor v10.16b, v10.16b, v2.16b\n\t" + "eor v11.16b, v11.16b, v3.16b\n\t" + "sri v28.4s, v4.4s, #8\n\t" + "sri v29.4s, v5.4s, #8\n\t" + "sri v30.4s, v6.4s, #8\n\t" + "sri v31.4s, v7.4s, #8\n\t" + "eor v8.16b, v8.16b, v28.16b\n\t" + "eor v9.16b, v9.16b, v29.16b\n\t" + "eor v10.16b, v10.16b, v30.16b\n\t" + "eor v11.16b, v11.16b, v31.16b\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v4.16b\n\t" + "eor v11.16b, v11.16b, v4.16b\n\t" + /* Round Done */ + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v9.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v10.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v11.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v0.16b, v8.16b, v12.16b\n\t" + "eor v1.16b, v9.16b, v12.16b\n\t" + "eor v2.16b, v10.16b, v12.16b\n\t" + "eor v3.16b, v11.16b, v12.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b\n\t" + "tbl v2.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v2.16b\n\t" + "tbl v3.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v3.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v5.16b, v5.16b, v1.16b\n\t" + "orr v6.16b, v6.16b, 
v2.16b\n\t" + "orr v7.16b, v7.16b, v3.16b\n\t" + "eor v0.16b, v8.16b, v13.16b\n\t" + "eor v1.16b, v9.16b, v13.16b\n\t" + "eor v2.16b, v10.16b, v13.16b\n\t" + "eor v3.16b, v11.16b, v13.16b\n\t" + "tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b\n\t" + "tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b\n\t" + "tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b\n\t" + "tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v5.16b, v5.16b, v1.16b\n\t" + "orr v6.16b, v6.16b, v2.16b\n\t" + "orr v7.16b, v7.16b, v3.16b\n\t" + "eor v0.16b, v8.16b, v14.16b\n\t" + "eor v1.16b, v9.16b, v14.16b\n\t" + "eor v2.16b, v10.16b, v14.16b\n\t" + "eor v3.16b, v11.16b, v14.16b\n\t" + "tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b\n\t" + "tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b\n\t" + "tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b\n\t" + "tbl v3.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v3.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v5.16b, v5.16b, v1.16b\n\t" + "orr v6.16b, v6.16b, v2.16b\n\t" + "orr v7.16b, v7.16b, v3.16b\n\t" + "ld1 {v8.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v8.16b\n\t" + "tbl v5.16b, {v5.16b}, v8.16b\n\t" + "tbl v6.16b, {v6.16b}, v8.16b\n\t" + "tbl v7.16b, {v7.16b}, v8.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v8.2d}, [x9], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v8.16b\n\t" + "eor v6.16b, v6.16b, v8.16b\n\t" + "eor v7.16b, v7.16b, v8.16b\n\t" + /* Round Done */ + "rev32 v4.16b, v4.16b\n\t" + "rev32 v5.16b, v5.16b\n\t" + "rev32 v6.16b, v6.16b\n\t" + "rev32 v7.16b, v7.16b\n\t" + "ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x10]\n\t" + "ldr q3, [x10, #64]\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[out]], #0x40\n\t" + "sub %x[len], %x[len], #0x40\n\t" + "cmp %x[len], 
#0x40\n\t" + "b.ge L_AES_CBC_decrypt_NEON_loop_4_%=\n\t" + "\n" + "L_AES_CBC_decrypt_NEON_start_2_%=: \n\t" + "cmp %x[len], #16\n\t" + "b.eq L_AES_CBC_decrypt_NEON_start_1_%=\n\t" + "b.lt L_AES_CBC_decrypt_NEON_data_done_%=\n\t" + "\n" + "L_AES_CBC_decrypt_NEON_loop_2_%=: \n\t" + "mov x9, %x[ks]\n\t" + "ld1 {v4.16b, v5.16b}, [%x[in]], #32\n\t" + "st1 {v3.2d, v4.2d, v5.2d}, [x10]\n\t" + "ld1 {v8.2d}, [x9], #16\n\t" + "rev32 v4.16b, v4.16b\n\t" + "rev32 v5.16b, v5.16b\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v8.16b\n\t" + "sub w8, %w[nr], #2\n\t" + "\n" + "L_AES_CBC_decrypt_NEON_loop_nr_2_%=: \n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v0.16b, v4.16b, v12.16b\n\t" + "eor v1.16b, v5.16b, v12.16b\n\t" + "tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v9.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b\n\t" + "eor v2.16b, v4.16b, v13.16b\n\t" + "eor v3.16b, v5.16b, v13.16b\n\t" + "tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b\n\t" + "tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "eor v0.16b, v4.16b, v14.16b\n\t" + "eor v1.16b, v5.16b, v14.16b\n\t" + "orr v8.16b, v8.16b, v2.16b\n\t" + "orr v9.16b, v9.16b, v3.16b\n\t" + "tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b\n\t" + "tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v8.16b, {v8.16b}, v4.16b\n\t" + "tbl v9.16b, {v9.16b}, v4.16b\n\t" + "movi v2.16b, #27\n\t" + "sshr v0.16b, v8.16b, #7\n\t" + "sshr v1.16b, v9.16b, #7\n\t" + "shl v12.16b, v8.16b, #1\n\t" + "shl v13.16b, v9.16b, #1\n\t" + "and v0.16b, v0.16b, v2.16b\n\t" + 
"and v1.16b, v1.16b, v2.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "ushr v12.16b, v8.16b, #6\n\t" + "ushr v13.16b, v9.16b, #6\n\t" + "shl v4.16b, v8.16b, #2\n\t" + "shl v5.16b, v9.16b, #2\n\t" + "pmul v12.16b, v12.16b, v2.16b\n\t" + "pmul v13.16b, v13.16b, v2.16b\n\t" + "eor v12.16b, v12.16b, v4.16b\n\t" + "eor v13.16b, v13.16b, v5.16b\n\t" + "ushr v4.16b, v8.16b, #5\n\t" + "ushr v5.16b, v9.16b, #5\n\t" + "pmul v4.16b, v4.16b, v2.16b\n\t" + "pmul v5.16b, v5.16b, v2.16b\n\t" + "shl v2.16b, v8.16b, #3\n\t" + "shl v3.16b, v9.16b, #3\n\t" + "eor v4.16b, v4.16b, v2.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v2.16b, v0.16b, v4.16b\n\t" + "eor v3.16b, v1.16b, v5.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v0.16b, v12.16b, v4.16b\n\t" + "eor v1.16b, v13.16b, v5.16b\n\t" + "eor v12.16b, v12.16b, v2.16b\n\t" + "eor v13.16b, v13.16b, v3.16b\n\t" + "eor v2.16b, v2.16b, v8.16b\n\t" + "eor v3.16b, v3.16b, v9.16b\n\t" + "shl v8.4s, v2.4s, #8\n\t" + "shl v9.4s, v3.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "sri v8.4s, v2.4s, #24\n\t" + "sri v9.4s, v3.4s, #24\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "shl v2.4s, v4.4s, #24\n\t" + "shl v3.4s, v5.4s, #24\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v9.16b, v9.16b, v1.16b\n\t" + "sri v2.4s, v4.4s, #8\n\t" + "sri v3.4s, v5.4s, #8\n\t" + "eor v8.16b, v8.16b, v2.16b\n\t" + "eor v9.16b, v9.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + /* Round Done */ + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v0.16b, v8.16b, v12.16b\n\t" + "eor v1.16b, v9.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v9.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, 
v0.16b\n\t" + "tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b\n\t" + "eor v2.16b, v8.16b, v13.16b\n\t" + "eor v3.16b, v9.16b, v13.16b\n\t" + "tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b\n\t" + "tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v5.16b, v5.16b, v1.16b\n\t" + "eor v0.16b, v8.16b, v14.16b\n\t" + "eor v1.16b, v9.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v2.16b\n\t" + "orr v5.16b, v5.16b, v3.16b\n\t" + "tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b\n\t" + "tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v5.16b, v5.16b, v1.16b\n\t" + "ld1 {v8.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v8.16b\n\t" + "tbl v5.16b, {v5.16b}, v8.16b\n\t" + "movi v2.16b, #27\n\t" + "sshr v0.16b, v4.16b, #7\n\t" + "sshr v1.16b, v5.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "and v0.16b, v0.16b, v2.16b\n\t" + "and v1.16b, v1.16b, v2.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "ushr v12.16b, v4.16b, #6\n\t" + "ushr v13.16b, v5.16b, #6\n\t" + "shl v8.16b, v4.16b, #2\n\t" + "shl v9.16b, v5.16b, #2\n\t" + "pmul v12.16b, v12.16b, v2.16b\n\t" + "pmul v13.16b, v13.16b, v2.16b\n\t" + "eor v12.16b, v12.16b, v8.16b\n\t" + "eor v13.16b, v13.16b, v9.16b\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "ushr v9.16b, v5.16b, #5\n\t" + "pmul v8.16b, v8.16b, v2.16b\n\t" + "pmul v9.16b, v9.16b, v2.16b\n\t" + "shl v2.16b, v4.16b, #3\n\t" + "shl v3.16b, v5.16b, #3\n\t" + "eor v8.16b, v8.16b, v2.16b\n\t" + "eor v9.16b, v9.16b, v3.16b\n\t" + "eor v2.16b, v0.16b, v8.16b\n\t" + "eor v3.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v9.16b, v9.16b, v5.16b\n\t" + "eor v0.16b, v12.16b, v8.16b\n\t" + "eor v1.16b, v13.16b, v9.16b\n\t" + "eor v12.16b, v12.16b, v2.16b\n\t" + "eor v13.16b, v13.16b, v3.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v5.16b\n\t" 
+ "shl v4.4s, v2.4s, #8\n\t" + "shl v5.4s, v3.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "sri v4.4s, v2.4s, #24\n\t" + "sri v5.4s, v3.4s, #24\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "shl v2.4s, v8.4s, #24\n\t" + "shl v3.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "sri v2.4s, v8.4s, #8\n\t" + "sri v3.4s, v9.4s, #8\n\t" + "eor v4.16b, v4.16b, v2.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v8.2d}, [x9], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v8.16b\n\t" + /* Round Done */ + "subs w8, w8, #2\n\t" + "b.ne L_AES_CBC_decrypt_NEON_loop_nr_2_%=\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v0.16b, v4.16b, v12.16b\n\t" + "eor v1.16b, v5.16b, v12.16b\n\t" + "tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v9.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b\n\t" + "eor v2.16b, v4.16b, v13.16b\n\t" + "eor v3.16b, v5.16b, v13.16b\n\t" + "tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b\n\t" + "tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "eor v0.16b, v4.16b, v14.16b\n\t" + "eor v1.16b, v5.16b, v14.16b\n\t" + "orr v8.16b, v8.16b, v2.16b\n\t" + "orr v9.16b, v9.16b, v3.16b\n\t" + "tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b\n\t" + "tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v9.16b, v9.16b, v1.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v8.16b, {v8.16b}, v4.16b\n\t" + "tbl v9.16b, {v9.16b}, v4.16b\n\t" + "movi v2.16b, #27\n\t" + "sshr v0.16b, v8.16b, #7\n\t" + "sshr v1.16b, v9.16b, #7\n\t" + "shl v12.16b, v8.16b, #1\n\t" + "shl 
v13.16b, v9.16b, #1\n\t" + "and v0.16b, v0.16b, v2.16b\n\t" + "and v1.16b, v1.16b, v2.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "ushr v12.16b, v8.16b, #6\n\t" + "ushr v13.16b, v9.16b, #6\n\t" + "shl v4.16b, v8.16b, #2\n\t" + "shl v5.16b, v9.16b, #2\n\t" + "pmul v12.16b, v12.16b, v2.16b\n\t" + "pmul v13.16b, v13.16b, v2.16b\n\t" + "eor v12.16b, v12.16b, v4.16b\n\t" + "eor v13.16b, v13.16b, v5.16b\n\t" + "ushr v4.16b, v8.16b, #5\n\t" + "ushr v5.16b, v9.16b, #5\n\t" + "pmul v4.16b, v4.16b, v2.16b\n\t" + "pmul v5.16b, v5.16b, v2.16b\n\t" + "shl v2.16b, v8.16b, #3\n\t" + "shl v3.16b, v9.16b, #3\n\t" + "eor v4.16b, v4.16b, v2.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v2.16b, v0.16b, v4.16b\n\t" + "eor v3.16b, v1.16b, v5.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v0.16b, v12.16b, v4.16b\n\t" + "eor v1.16b, v13.16b, v5.16b\n\t" + "eor v12.16b, v12.16b, v2.16b\n\t" + "eor v13.16b, v13.16b, v3.16b\n\t" + "eor v2.16b, v2.16b, v8.16b\n\t" + "eor v3.16b, v3.16b, v9.16b\n\t" + "shl v8.4s, v2.4s, #8\n\t" + "shl v9.4s, v3.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "sri v8.4s, v2.4s, #24\n\t" + "sri v9.4s, v3.4s, #24\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "shl v2.4s, v4.4s, #24\n\t" + "shl v3.4s, v5.4s, #24\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v9.16b, v9.16b, v1.16b\n\t" + "sri v2.4s, v4.4s, #8\n\t" + "sri v3.4s, v5.4s, #8\n\t" + "eor v8.16b, v8.16b, v2.16b\n\t" + "eor v9.16b, v9.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x9], #16\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + /* Round Done */ + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v0.16b, v8.16b, v12.16b\n\t" + "eor v1.16b, v9.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, 
v9.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v1.16b\n\t" + "eor v2.16b, v8.16b, v13.16b\n\t" + "eor v3.16b, v9.16b, v13.16b\n\t" + "tbl v2.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v2.16b\n\t" + "tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v5.16b, v5.16b, v1.16b\n\t" + "eor v0.16b, v8.16b, v14.16b\n\t" + "eor v1.16b, v9.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v2.16b\n\t" + "orr v5.16b, v5.16b, v3.16b\n\t" + "tbl v0.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v0.16b\n\t" + "tbl v1.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v1.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v5.16b, v5.16b, v1.16b\n\t" + "ld1 {v8.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v8.16b\n\t" + "tbl v5.16b, {v5.16b}, v8.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v8.2d}, [x9], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v8.16b\n\t" + /* Round Done */ + "rev32 v4.16b, v4.16b\n\t" + "rev32 v5.16b, v5.16b\n\t" + "ld1 {v1.16b, v2.16b, v3.16b}, [x10]\n\t" + "eor v4.16b, v4.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v2.16b\n\t" + "st1 {v4.16b, v5.16b}, [%x[out]], #32\n\t" + "sub %x[len], %x[len], #32\n\t" + "cmp %x[len], #32\n\t" + "b.ge L_AES_CBC_decrypt_NEON_loop_2_%=\n\t" + "cmp %x[len], #0\n\t" + "b.eq L_AES_CBC_decrypt_NEON_data_done_%=\n\t" + "\n" + "L_AES_CBC_decrypt_NEON_start_1_%=: \n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "movi v15.16b, #27\n\t" + "ld1 {v7.2d}, [%[invshuffle]]\n\t" + "mov x9, %x[ks]\n\t" + "ld1 {v4.16b}, [%x[in]], #16\n\t" + "mov v10.16b, v3.16b\n\t" + "mov v11.16b, v4.16b\n\t" + "ld1 {v8.16b}, [x9], #16\n\t" + "rev32 v4.16b, v4.16b\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v4.16b, v4.16b, v8.16b\n\t" + "sub w8, %w[nr], #2\n\t" + "\n" + "L_AES_CBC_decrypt_NEON_loop_nr_1_%=: \n\t" + "eor v0.16b, v4.16b, v12.16b\n\t" + "eor v1.16b, v4.16b, 
v13.16b\n\t" + "eor v2.16b, v4.16b, v14.16b\n\t" + "tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b\n\t" + "tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v1.16b, v1.16b, v2.16b\n\t" + "orr v8.16b, v8.16b, v1.16b\n\t" + "tbl v8.16b, {v8.16b}, v7.16b\n\t" + "sshr v2.16b, v8.16b, #7\n\t" + "ushr v3.16b, v8.16b, #6\n\t" + "ushr v0.16b, v8.16b, #5\n\t" + "and v2.16b, v2.16b, v15.16b\n\t" + "pmul v3.16b, v3.16b, v15.16b\n\t" + "pmul v0.16b, v0.16b, v15.16b\n\t" + "shl v1.16b, v8.16b, #1\n\t" + "eor v2.16b, v2.16b, v1.16b\n\t" + "shl v1.16b, v8.16b, #3\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "shl v1.16b, v8.16b, #2\n\t" + "eor v3.16b, v3.16b, v1.16b\n\t" + "eor v1.16b, v2.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v2.16b, v3.16b, v0.16b\n\t" + "eor v3.16b, v3.16b, v1.16b\n\t" + "eor v1.16b, v1.16b, v8.16b\n\t" + "shl v8.4s, v1.4s, #8\n\t" + "rev32 v2.8h, v2.8h\n\t" + "sri v8.4s, v1.4s, #24\n\t" + "eor v8.16b, v8.16b, v3.16b\n\t" + "shl v1.4s, v0.4s, #24\n\t" + "eor v8.16b, v8.16b, v2.16b\n\t" + "sri v1.4s, v0.4s, #8\n\t" + "eor v8.16b, v8.16b, v1.16b\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" + /* XOR in Key Schedule */ + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v0.16b, v8.16b, v12.16b\n\t" + "eor v1.16b, v8.16b, v13.16b\n\t" + "eor v2.16b, v8.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b\n\t" + "tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v1.16b, v1.16b, v2.16b\n\t" + "orr v4.16b, v4.16b, v1.16b\n\t" + "tbl v4.16b, {v4.16b}, v7.16b\n\t" + "sshr v2.16b, v4.16b, #7\n\t" + "ushr v3.16b, v4.16b, #6\n\t" + "ushr v0.16b, v4.16b, #5\n\t" + "and v2.16b, 
v2.16b, v15.16b\n\t" + "pmul v3.16b, v3.16b, v15.16b\n\t" + "pmul v0.16b, v0.16b, v15.16b\n\t" + "shl v1.16b, v4.16b, #1\n\t" + "eor v2.16b, v2.16b, v1.16b\n\t" + "shl v1.16b, v4.16b, #3\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "shl v1.16b, v4.16b, #2\n\t" + "eor v3.16b, v3.16b, v1.16b\n\t" + "eor v1.16b, v2.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v2.16b, v3.16b, v0.16b\n\t" + "eor v3.16b, v3.16b, v1.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "shl v4.4s, v1.4s, #8\n\t" + "rev32 v2.8h, v2.8h\n\t" + "sri v4.4s, v1.4s, #24\n\t" + "eor v4.16b, v4.16b, v3.16b\n\t" + "shl v1.4s, v0.4s, #24\n\t" + "eor v4.16b, v4.16b, v2.16b\n\t" + "sri v1.4s, v0.4s, #8\n\t" + "eor v4.16b, v4.16b, v1.16b\n\t" + "ld1 {v8.2d}, [x9], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v8.16b\n\t" + "subs w8, w8, #2\n\t" + "b.ne L_AES_CBC_decrypt_NEON_loop_nr_1_%=\n\t" + "eor v0.16b, v4.16b, v12.16b\n\t" + "eor v1.16b, v4.16b, v13.16b\n\t" + "eor v2.16b, v4.16b, v14.16b\n\t" + "tbl v8.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b\n\t" + "tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b\n\t" + "orr v8.16b, v8.16b, v0.16b\n\t" + "orr v1.16b, v1.16b, v2.16b\n\t" + "orr v8.16b, v8.16b, v1.16b\n\t" + "tbl v8.16b, {v8.16b}, v7.16b\n\t" + "sshr v2.16b, v8.16b, #7\n\t" + "ushr v3.16b, v8.16b, #6\n\t" + "ushr v0.16b, v8.16b, #5\n\t" + "and v2.16b, v2.16b, v15.16b\n\t" + "pmul v3.16b, v3.16b, v15.16b\n\t" + "pmul v0.16b, v0.16b, v15.16b\n\t" + "shl v1.16b, v8.16b, #1\n\t" + "eor v2.16b, v2.16b, v1.16b\n\t" + "shl v1.16b, v8.16b, #3\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "shl v1.16b, v8.16b, #2\n\t" + "eor v3.16b, v3.16b, v1.16b\n\t" + "eor v1.16b, v2.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v2.16b, v3.16b, v0.16b\n\t" + "eor v3.16b, v3.16b, v1.16b\n\t" + "eor v1.16b, v1.16b, v8.16b\n\t" + "shl v8.4s, v1.4s, 
#8\n\t" + "rev32 v2.8h, v2.8h\n\t" + "sri v8.4s, v1.4s, #24\n\t" + "eor v8.16b, v8.16b, v3.16b\n\t" + "shl v1.4s, v0.4s, #24\n\t" + "eor v8.16b, v8.16b, v2.16b\n\t" + "sri v1.4s, v0.4s, #8\n\t" + "eor v8.16b, v8.16b, v1.16b\n\t" + "ld1 {v4.2d}, [x9], #16\n\t" + /* XOR in Key Schedule */ + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v0.16b, v8.16b, v12.16b\n\t" + "eor v1.16b, v8.16b, v13.16b\n\t" + "eor v2.16b, v8.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v8.16b\n\t" + "tbl v0.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v0.16b\n\t" + "tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b\n\t" + "tbl v2.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v2.16b\n\t" + "orr v4.16b, v4.16b, v0.16b\n\t" + "orr v1.16b, v1.16b, v2.16b\n\t" + "orr v4.16b, v4.16b, v1.16b\n\t" + "tbl v4.16b, {v4.16b}, v7.16b\n\t" + "ld1 {v8.2d}, [x9], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v8.16b\n\t" + "rev32 v4.16b, v4.16b\n\t" + "mov v3.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "st1 {v4.16b}, [%x[out]], #16\n\t" + "\n" + "L_AES_CBC_decrypt_NEON_data_done_%=: \n\t" + "st1 {v3.2d}, [%x[iv]]\n\t" + "ldp x29, x30, [sp], #0x60\n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [iv] "+r" (iv) + : [in] "r" (in), [ks] "r" (ks), [td] "r" (td), + [invshuffle] "r" (invshuffle) + : "memory", "cc", "x8", "x9", "x10", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} + +#endif /* HAVE_AES_CBC */ +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || HAVE_AES_CBC + * HAVE_AES_ECB */ +#endif /* HAVE_AES_DECRYPT */ +#ifdef HAVE_AESGCM +void GCM_gmult_len_NEON(unsigned char* x, const unsigned char* h, + const unsigned char* data, unsigned long len); +void GCM_gmult_len_NEON(unsigned char* x, const unsigned char* h, + const unsigned char* data, unsigned long len) +{ + 
__asm__ __volatile__ ( + "ld1 {v18.2d}, [%x[x]]\n\t" + "ld1 {v10.2d}, [%x[h]]\n\t" + "movi v19.16b, #15\n\t" + "eor v20.16b, v20.16b, v20.16b\n\t" + "rbit v18.16b, v18.16b\n\t" + "rbit v10.16b, v10.16b\n\t" + "and v12.16b, v10.16b, v19.16b\n\t" + "ushr v13.16b, v10.16b, #4\n\t" + "eor v14.16b, v12.16b, v13.16b\n\t" + "\n" + "L_GCM_gmult_len_NEON_start_block_%=: \n\t" + "ld1 {v0.16b}, [%x[data]], #16\n\t" + "rbit v0.16b, v0.16b\n\t" + "eor v18.16b, v18.16b, v0.16b\n\t" + /* Mul 128x128 */ + "and v15.16b, v18.16b, v19.16b\n\t" + "ushr v16.16b, v18.16b, #4\n\t" + "eor v17.16b, v15.16b, v16.16b\n\t" + "dup v0.16b, v12.b[0]\n\t" + "dup v2.16b, v14.b[0]\n\t" + "dup v1.16b, v13.b[0]\n\t" + "pmul v8.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "dup v0.16b, v12.b[1]\n\t" + "dup v2.16b, v14.b[1]\n\t" + "dup v1.16b, v13.b[1]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v6.16b, v20.16b, v3.16b, #15\n\t" + "ext v9.16b, v3.16b, v20.16b, #15\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[2]\n\t" + "dup v2.16b, v14.b[2]\n\t" + "dup v1.16b, v13.b[2]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + 
"ext v7.16b, v3.16b, v20.16b, #14\n\t" + "ext v6.16b, v20.16b, v3.16b, #14\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[3]\n\t" + "dup v2.16b, v14.b[3]\n\t" + "dup v1.16b, v13.b[3]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #13\n\t" + "ext v6.16b, v20.16b, v3.16b, #13\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[4]\n\t" + "dup v2.16b, v14.b[4]\n\t" + "dup v1.16b, v13.b[4]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #12\n\t" + "ext v6.16b, v20.16b, v3.16b, #12\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[5]\n\t" + "dup v2.16b, v14.b[5]\n\t" + "dup v1.16b, v13.b[5]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #11\n\t" + "ext v6.16b, v20.16b, v3.16b, #11\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[6]\n\t" + "dup v2.16b, v14.b[6]\n\t" + "dup v1.16b, 
v13.b[6]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #10\n\t" + "ext v6.16b, v20.16b, v3.16b, #10\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[7]\n\t" + "dup v2.16b, v14.b[7]\n\t" + "dup v1.16b, v13.b[7]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #9\n\t" + "ext v6.16b, v20.16b, v3.16b, #9\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[8]\n\t" + "dup v2.16b, v14.b[8]\n\t" + "dup v1.16b, v13.b[8]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #8\n\t" + "ext v6.16b, v20.16b, v3.16b, #8\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[9]\n\t" + "dup v2.16b, v14.b[9]\n\t" + "dup v1.16b, v13.b[9]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + 
"shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #7\n\t" + "ext v6.16b, v20.16b, v3.16b, #7\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[10]\n\t" + "dup v2.16b, v14.b[10]\n\t" + "dup v1.16b, v13.b[10]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #6\n\t" + "ext v6.16b, v20.16b, v3.16b, #6\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[11]\n\t" + "dup v2.16b, v14.b[11]\n\t" + "dup v1.16b, v13.b[11]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #5\n\t" + "ext v6.16b, v20.16b, v3.16b, #5\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[12]\n\t" + "dup v2.16b, v14.b[12]\n\t" + "dup v1.16b, v13.b[12]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #4\n\t" + "ext v6.16b, v20.16b, v3.16b, #4\n\t" + "eor v9.16b, 
v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[13]\n\t" + "dup v2.16b, v14.b[13]\n\t" + "dup v1.16b, v13.b[13]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #3\n\t" + "ext v6.16b, v20.16b, v3.16b, #3\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[14]\n\t" + "dup v2.16b, v14.b[14]\n\t" + "dup v1.16b, v13.b[14]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #2\n\t" + "ext v6.16b, v20.16b, v3.16b, #2\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "dup v0.16b, v12.b[15]\n\t" + "dup v2.16b, v14.b[15]\n\t" + "dup v1.16b, v13.b[15]\n\t" + "pmul v3.16b, v15.16b, v0.16b\n\t" + "pmul v5.16b, v17.16b, v2.16b\n\t" + "pmul v4.16b, v16.16b, v1.16b\n\t" + "eor v5.16b, v5.16b, v3.16b\n\t" + "eor v5.16b, v5.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "shl v6.16b, v5.16b, #4\n\t" + "ushr v7.16b, v5.16b, #4\n\t" + "eor v3.16b, v3.16b, v6.16b\n\t" + "eor v11.16b, v4.16b, v7.16b\n\t" + "ext v7.16b, v3.16b, v20.16b, #1\n\t" + "ext v6.16b, v20.16b, v3.16b, #1\n\t" + "eor v9.16b, v9.16b, v7.16b\n\t" + "eor v8.16b, v8.16b, v6.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + /* Reduce 254-bit number */ + "shl v0.16b, v9.16b, #1\n\t" + "shl v1.16b, v9.16b, #2\n\t" + "shl v2.16b, v9.16b, #7\n\t" + "ushr 
v3.16b, v9.16b, #7\n\t" + "ushr v4.16b, v9.16b, #6\n\t" + "ushr v5.16b, v9.16b, #1\n\t" + "eor v0.16b, v0.16b, v9.16b\n\t" + "eor v1.16b, v1.16b, v2.16b\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "ext v0.16b, v20.16b, v3.16b, #15\n\t" + "ext v1.16b, v20.16b, v4.16b, #15\n\t" + "ext v2.16b, v20.16b, v5.16b, #15\n\t" + "ext v4.16b, v4.16b, v20.16b, #15\n\t" + "ext v5.16b, v5.16b, v20.16b, #15\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "eor v8.16b, v8.16b, v2.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v3.16b, v4.16b, v5.16b\n\t" + "shl v0.2d, v3.2d, #1\n\t" + "shl v1.2d, v3.2d, #2\n\t" + "shl v2.2d, v3.2d, #7\n\t" + "eor v3.16b, v3.16b, v0.16b\n\t" + "eor v1.16b, v1.16b, v2.16b\n\t" + "eor v8.16b, v8.16b, v3.16b\n\t" + "eor v18.16b, v8.16b, v1.16b\n\t" + "subs %x[len], %x[len], #16\n\t" + "b.ne L_GCM_gmult_len_NEON_start_block_%=\n\t" + "rbit v18.16b, v18.16b\n\t" + "st1 {v18.2d}, [%x[x]]\n\t" + : [x] "+r" (x), [len] "+r" (len), [data] "+r" (data) + : [h] "r" (h) /* data is in/out: post-indexed ld1 advances it */ + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20" + ); +} + +void AES_GCM_encrypt_NEON(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +void AES_GCM_encrypt_NEON(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr) +{ + const word8* te = L_AES_ARM64_NEON_te; + const word8* shuffle = L_AES_ARM64_NEON_shift_rows_shuffle; + __asm__ __volatile__ ( + "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[te]], #0x40\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[te]], #0x40\n\t" + "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[te]], #0x40\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[te]]\n\t" + "ld1 {v2.2d}, [%x[ctr]]\n\t" + "rev32 v2.16b, v2.16b\n\t" + "mov w6, v2.s[3]\n\t" + "cmp %x[len], #0x40\n\t" + "b.lt 
L_AES_GCM_encrypt_NEON_start_2_%=\n\t" + "mov x7, v2.d[0]\n\t" + "mov x8, v2.d[1]\n\t" + "\n" + "L_AES_GCM_encrypt_NEON_loop_4_%=: \n\t" + "mov x12, %x[ks]\n\t" + "ld1 {v4.2d}, [x12], #16\n\t" + "mov v8.d[0], x7\n\t" + "mov v8.d[1], x8\n\t" + /* Round: 0 - XOR in key schedule */ + "add w6, w6, #1\n\t" + "mov v8.s[3], w6\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "add w6, w6, #1\n\t" + "mov v8.s[3], w6\n\t" + "eor v1.16b, v8.16b, v4.16b\n\t" + "add w6, w6, #1\n\t" + "mov v8.s[3], w6\n\t" + "eor v2.16b, v8.16b, v4.16b\n\t" + "add w6, w6, #1\n\t" + "mov v8.s[3], w6\n\t" + "eor v3.16b, v8.16b, v4.16b\n\t" + "sub w11, %w[nr], #2\n\t" + "\n" + "L_AES_GCM_encrypt_NEON_loop_nr_4_%=: \n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + 
"orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "movi v0.16b, #27\n\t" + "and v8.16b, v8.16b, v0.16b\n\t" + "and v9.16b, v9.16b, v0.16b\n\t" + "and v10.16b, v10.16b, v0.16b\n\t" + "and v11.16b, v11.16b, v0.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "eor v2.16b, v10.16b, v6.16b\n\t" + "eor v3.16b, v11.16b, v7.16b\n\t" + "shl v12.4s, v0.4s, #8\n\t" + "shl v13.4s, v1.4s, #8\n\t" + "shl v14.4s, v2.4s, #8\n\t" + "shl v15.4s, v3.4s, #8\n\t" + "sri v12.4s, v0.4s, #24\n\t" + "sri v13.4s, v1.4s, #24\n\t" + "sri v14.4s, v2.4s, #24\n\t" + "sri v15.4s, v3.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "shl v2.4s, v6.4s, #24\n\t" + "shl v3.4s, v7.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, 
v5.4s, #8\n\t" + "sri v2.4s, v6.4s, #8\n\t" + "sri v3.4s, v7.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "rev32 v6.8h, v6.8h\n\t" + "rev32 v7.8h, v7.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x12], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + 
"tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "sshr v10.16b, v2.16b, #7\n\t" + "sshr v11.16b, v3.16b, #7\n\t" + "shl v12.16b, v0.16b, #1\n\t" + "shl v13.16b, v1.16b, #1\n\t" + "shl v14.16b, v2.16b, #1\n\t" + "shl v15.16b, v3.16b, #1\n\t" + "movi v4.16b, #27\n\t" + "and v8.16b, v8.16b, v4.16b\n\t" + "and v9.16b, v9.16b, v4.16b\n\t" + "and v10.16b, v10.16b, v4.16b\n\t" + "and v11.16b, v11.16b, v4.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v4.16b, v8.16b, v0.16b\n\t" + "eor v5.16b, v9.16b, v1.16b\n\t" + "eor v6.16b, v10.16b, v2.16b\n\t" + "eor v7.16b, v11.16b, v3.16b\n\t" + "shl v12.4s, v4.4s, #8\n\t" + "shl v13.4s, v5.4s, #8\n\t" + "shl v14.4s, v6.4s, #8\n\t" + "shl v15.4s, v7.4s, #8\n\t" + "sri v12.4s, v4.4s, #24\n\t" + "sri v13.4s, v5.4s, #24\n\t" + "sri v14.4s, v6.4s, #24\n\t" + "sri v15.4s, v7.4s, #24\n\t" + "shl v4.4s, v0.4s, #24\n\t" 
+ "shl v5.4s, v1.4s, #24\n\t" + "shl v6.4s, v2.4s, #24\n\t" + "shl v7.4s, v3.4s, #24\n\t" + "sri v4.4s, v0.4s, #8\n\t" + "sri v5.4s, v1.4s, #8\n\t" + "sri v6.4s, v2.4s, #8\n\t" + "sri v7.4s, v3.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "rev32 v2.8h, v2.8h\n\t" + "rev32 v3.8h, v3.8h\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x12], #16\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "eor v2.16b, v2.16b, v14.16b\n\t" + "eor v3.16b, v3.16b, v15.16b\n\t" + /* Round Done */ + "subs w11, w11, #2\n\t" + "b.ne L_AES_GCM_encrypt_NEON_loop_nr_4_%=\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, 
v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "movi v0.16b, #27\n\t" + "and v8.16b, v8.16b, v0.16b\n\t" + "and v9.16b, v9.16b, v0.16b\n\t" + "and v10.16b, v10.16b, v0.16b\n\t" + "and v11.16b, v11.16b, v0.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "eor v2.16b, v10.16b, v6.16b\n\t" + "eor v3.16b, v11.16b, v7.16b\n\t" + "shl v12.4s, v0.4s, #8\n\t" + "shl v13.4s, v1.4s, 
#8\n\t" + "shl v14.4s, v2.4s, #8\n\t" + "shl v15.4s, v3.4s, #8\n\t" + "sri v12.4s, v0.4s, #24\n\t" + "sri v13.4s, v1.4s, #24\n\t" + "sri v14.4s, v2.4s, #24\n\t" + "sri v15.4s, v3.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "shl v2.4s, v6.4s, #24\n\t" + "shl v3.4s, v7.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "sri v2.4s, v6.4s, #8\n\t" + "sri v3.4s, v7.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "rev32 v6.8h, v6.8h\n\t" + "rev32 v7.8h, v7.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x12], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, 
v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x12], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%x[in]], #0x40\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" + "sub %x[len], %x[len], 
#0x40\n\t" + "cmp %x[len], #0x40\n\t" + "b.ge L_AES_GCM_encrypt_NEON_loop_4_%=\n\t" + "mov v2.d[0], x7\n\t" + "mov v2.d[1], x8\n\t" + "mov v2.s[3], w6\n\t" + "\n" + "L_AES_GCM_encrypt_NEON_start_2_%=: \n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "movi v15.16b, #27\n\t" + "cmp %x[len], #16\n\t" + "b.eq L_AES_GCM_encrypt_NEON_start_1_%=\n\t" + "b.lt L_AES_GCM_encrypt_NEON_data_done_%=\n\t" + "\n" + "L_AES_GCM_encrypt_NEON_loop_2_%=: \n\t" + "mov x12, %x[ks]\n\t" + "ld1 {v4.2d}, [x12], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "add w6, w6, #1\n\t" + "mov v2.s[3], w6\n\t" + "eor v0.16b, v2.16b, v4.16b\n\t" + "add w6, w6, #1\n\t" + "mov v2.s[3], w6\n\t" + "eor v1.16b, v2.16b, v4.16b\n\t" + "sub w11, %w[nr], #2\n\t" + "\n" + "L_AES_GCM_encrypt_NEON_loop_nr_2_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v10.16b, v4.16b, #1\n\t" + "shl 
v11.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "shl v10.4s, v0.4s, #8\n\t" + "shl v11.4s, v1.4s, #8\n\t" + "sri v10.4s, v0.4s, #24\n\t" + "sri v11.4s, v1.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x12], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* Round Done */ + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr 
v9.16b, v1.16b, #7\n\t" + "shl v10.16b, v0.16b, #1\n\t" + "shl v11.16b, v1.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v4.16b, v8.16b, v0.16b\n\t" + "eor v5.16b, v9.16b, v1.16b\n\t" + "shl v10.4s, v4.4s, #8\n\t" + "shl v11.4s, v5.4s, #8\n\t" + "sri v10.4s, v4.4s, #24\n\t" + "sri v11.4s, v5.4s, #24\n\t" + "shl v4.4s, v0.4s, #24\n\t" + "shl v5.4s, v1.4s, #24\n\t" + "sri v4.4s, v0.4s, #8\n\t" + "sri v5.4s, v1.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x12], #16\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + /* Round Done */ + "subs w11, w11, #2\n\t" + "b.ne L_AES_GCM_encrypt_NEON_loop_nr_2_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 
{v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v10.16b, v4.16b, #1\n\t" + "shl v11.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "shl v10.4s, v0.4s, #8\n\t" + "shl v11.4s, v1.4s, #8\n\t" + "sri v10.4s, v0.4s, #24\n\t" + "sri v11.4s, v1.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x12], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* Round Done */ + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, 
v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x12], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "ld1 {v4.16b, v5.16b}, [%x[in]], #32\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "st1 {v0.16b, v1.16b}, [%x[out]], #32\n\t" + "sub %x[len], %x[len], #32\n\t" + "cmp %x[len], #0\n\t" + "b.eq L_AES_GCM_encrypt_NEON_data_done_%=\n\t" + "\n" + "L_AES_GCM_encrypt_NEON_start_1_%=: \n\t" + "ld1 {v3.2d}, [%[shuffle]]\n\t" + "mov x12, %x[ks]\n\t" + "add w6, w6, #1\n\t" + "ld1 {v4.2d}, [x12], #16\n\t" + "mov v2.s[3], w6\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v0.16b, v2.16b, v4.16b\n\t" + "sub w11, %w[nr], #2\n\t" + "\n" + "L_AES_GCM_encrypt_NEON_loop_nr_1_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x12], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor 
v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x12], #16\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v0.8h\n\t" + "eor v11.16b, v10.16b, v0.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v0.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v4.16b\n\t" + "sri v9.4s, v0.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v0.16b, v10.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "subs w11, w11, #2\n\t" + "b.ne L_AES_GCM_encrypt_NEON_loop_nr_1_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x12], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR 
in Key Schedule */ + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x12], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "ld1 {v4.16b}, [%x[in]], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "st1 {v0.16b}, [%x[out]], #16\n\t" + "\n" + "L_AES_GCM_encrypt_NEON_data_done_%=: \n\t" + "rev32 v2.16b, v2.16b\n\t" + "st1 {v2.2d}, [%x[ctr]]\n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [in] "r" (in), [ks] "r" (ks), [te] "r" (te), [shuffle] "r" (shuffle) + : "memory", "cc", "x6", "x7", "x8", "x11", "x12", "v0", "v1", "v2", + "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31" + ); +} + +#endif /* HAVE_AESGCM */ +#ifdef WOLFSSL_AES_XTS +void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, + byte* key, byte* key2, byte* tmp, int nr) +{ + const word8* te = L_AES_ARM64_NEON_te; + const word8* shuffle = L_AES_ARM64_NEON_shift_rows_shuffle; + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" + "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[te]], #0x40\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[te]], #0x40\n\t" + "ld1 {v24.16b, v25.16b, 
v26.16b, v27.16b}, [%[te]], #0x40\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[te]]\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "movi v15.16b, #27\n\t" + "ld1 {v3.2d}, [%[shuffle]]\n\t" + "mov x17, #0x87\n\t" + "ld1 {v2.2d}, [%x[i]]\n\t" + "ld1 {v4.2d}, [%x[key2]]\n\t" + "rev32 v2.16b, v2.16b\n\t" + "add x22, %x[key2], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v2.16b, v2.16b, v4.16b\n\t" + "sub w21, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_encrypt_NEON_loop_nr_tweak_%=: \n\t" + "eor v8.16b, v2.16b, v12.16b\n\t" + "eor v9.16b, v2.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v2.2d}, [x22], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v2.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v2.16b, v2.16b, v8.16b\n\t" + "orr 
v9.16b, v9.16b, v10.16b\n\t" + "orr v2.16b, v2.16b, v9.16b\n\t" + "tbl v2.16b, {v2.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x22], #16\n\t" + "sshr v10.16b, v2.16b, #7\n\t" + "shl v9.16b, v2.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v2.8h\n\t" + "eor v11.16b, v10.16b, v2.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v2.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v4.16b\n\t" + "sri v9.4s, v2.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v2.16b, v10.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v8.16b\n\t" + "subs w21, w21, #2\n\t" + "b.ne L_AES_XTS_encrypt_NEON_loop_nr_tweak_%=\n\t" + "eor v8.16b, v2.16b, v12.16b\n\t" + "eor v9.16b, v2.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v2.2d}, [x22], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v2.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl 
v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v2.16b, v2.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v2.16b, v2.16b, v9.16b\n\t" + "tbl v2.16b, {v2.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x22], #16\n\t" + /* XOR in Key Schedule */ + "eor v2.16b, v2.16b, v4.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "mov x8, v2.d[0]\n\t" + "mov x9, v2.d[1]\n\t" + "cmp %w[sz], #0x40\n\t" + "b.lt L_AES_XTS_encrypt_NEON_start_2_%=\n\t" + "\n" + "L_AES_XTS_encrypt_NEON_loop_4_%=: \n\t" + "mov x22, %x[key]\n\t" + "ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t" + "ld1 {v4.16b}, [x22], #16\n\t" + "and x16, x17, x9, asr 63\n\t" + "extr x11, x9, x8, #63\n\t" + "eor x10, x16, x8, lsl 1\n\t" + "and x16, x17, x11, asr 63\n\t" + "extr x13, x11, x10, #63\n\t" + "eor x12, x16, x10, lsl 1\n\t" + "and x16, x17, x13, asr 63\n\t" + "extr x15, x13, x12, #63\n\t" + "eor x14, x16, x12, lsl 1\n\t" + "mov v8.d[0], x8\n\t" + "mov v8.d[1], x9\n\t" + "mov v9.d[0], x10\n\t" + "mov v9.d[1], x11\n\t" + "mov v10.d[0], x12\n\t" + "mov v10.d[1], x13\n\t" + "mov v11.d[0], x14\n\t" + "mov v11.d[1], x15\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "sub w21, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_encrypt_NEON_loop_nr_4_%=: \n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, 
#0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, 
v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "movi v0.16b, #27\n\t" + "and v8.16b, v8.16b, v0.16b\n\t" + "and v9.16b, v9.16b, v0.16b\n\t" + "and v10.16b, v10.16b, v0.16b\n\t" + "and v11.16b, v11.16b, v0.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "eor v2.16b, v10.16b, v6.16b\n\t" + "eor v3.16b, v11.16b, v7.16b\n\t" + "shl v12.4s, v0.4s, #8\n\t" + "shl v13.4s, v1.4s, #8\n\t" + "shl v14.4s, v2.4s, #8\n\t" + "shl v15.4s, v3.4s, #8\n\t" + "sri v12.4s, v0.4s, #24\n\t" + "sri v13.4s, v1.4s, #24\n\t" + "sri v14.4s, v2.4s, #24\n\t" + "sri v15.4s, v3.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "shl v2.4s, v6.4s, #24\n\t" + "shl v3.4s, v7.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "sri v2.4s, v6.4s, #8\n\t" + "sri v3.4s, v7.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "rev32 v6.8h, v6.8h\n\t" + "rev32 v7.8h, v7.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x22], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" 
+ "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + "sshr 
v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "sshr v10.16b, v2.16b, #7\n\t" + "sshr v11.16b, v3.16b, #7\n\t" + "shl v12.16b, v0.16b, #1\n\t" + "shl v13.16b, v1.16b, #1\n\t" + "shl v14.16b, v2.16b, #1\n\t" + "shl v15.16b, v3.16b, #1\n\t" + "movi v4.16b, #27\n\t" + "and v8.16b, v8.16b, v4.16b\n\t" + "and v9.16b, v9.16b, v4.16b\n\t" + "and v10.16b, v10.16b, v4.16b\n\t" + "and v11.16b, v11.16b, v4.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v4.16b, v8.16b, v0.16b\n\t" + "eor v5.16b, v9.16b, v1.16b\n\t" + "eor v6.16b, v10.16b, v2.16b\n\t" + "eor v7.16b, v11.16b, v3.16b\n\t" + "shl v12.4s, v4.4s, #8\n\t" + "shl v13.4s, v5.4s, #8\n\t" + "shl v14.4s, v6.4s, #8\n\t" + "shl v15.4s, v7.4s, #8\n\t" + "sri v12.4s, v4.4s, #24\n\t" + "sri v13.4s, v5.4s, #24\n\t" + "sri v14.4s, v6.4s, #24\n\t" + "sri v15.4s, v7.4s, #24\n\t" + "shl v4.4s, v0.4s, #24\n\t" + "shl v5.4s, v1.4s, #24\n\t" + "shl v6.4s, v2.4s, #24\n\t" + "shl v7.4s, v3.4s, #24\n\t" + "sri v4.4s, v0.4s, #8\n\t" + "sri v5.4s, v1.4s, #8\n\t" + "sri v6.4s, v2.4s, #8\n\t" + "sri v7.4s, v3.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "rev32 v2.8h, v2.8h\n\t" + "rev32 v3.8h, v3.8h\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x22], #16\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "eor v2.16b, v2.16b, v14.16b\n\t" + "eor v3.16b, v3.16b, v15.16b\n\t" + /* Round Done */ + "subs w21, w21, #2\n\t" + "b.ne 
L_AES_XTS_encrypt_NEON_loop_nr_4_%=\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, 
v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "movi v0.16b, #27\n\t" + "and v8.16b, v8.16b, v0.16b\n\t" + "and v9.16b, v9.16b, v0.16b\n\t" + "and v10.16b, v10.16b, v0.16b\n\t" + "and v11.16b, v11.16b, v0.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "eor v2.16b, v10.16b, v6.16b\n\t" + "eor v3.16b, v11.16b, v7.16b\n\t" + "shl v12.4s, v0.4s, #8\n\t" + "shl v13.4s, v1.4s, #8\n\t" + "shl v14.4s, v2.4s, #8\n\t" + "shl v15.4s, v3.4s, #8\n\t" + "sri v12.4s, v0.4s, #24\n\t" + "sri v13.4s, v1.4s, #24\n\t" + "sri v14.4s, v2.4s, #24\n\t" + "sri v15.4s, v3.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "shl v2.4s, v6.4s, #24\n\t" + "shl v3.4s, v7.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "sri v2.4s, v6.4s, #8\n\t" + "sri v3.4s, v7.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "rev32 v6.8h, v6.8h\n\t" + "rev32 v7.8h, v7.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x22], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, 
v7.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, 
v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x22], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + "mov v8.d[0], x8\n\t" + "mov v8.d[1], x9\n\t" + "mov v9.d[0], x10\n\t" + "mov v9.d[1], x11\n\t" + "mov v10.d[0], x12\n\t" + "mov v10.d[1], x13\n\t" + "mov v11.d[0], x14\n\t" + "mov v11.d[1], x15\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" + "and x16, x17, x15, asr 63\n\t" + "extr x9, x15, x14, #63\n\t" + "eor x8, x16, x14, lsl 1\n\t" + "sub %w[sz], %w[sz], #0x40\n\t" + "cmp %w[sz], #0x40\n\t" + "b.ge L_AES_XTS_encrypt_NEON_loop_4_%=\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "movi v15.16b, #27\n\t" + "\n" + "L_AES_XTS_encrypt_NEON_start_2_%=: \n\t" + "cmp %w[sz], #32\n\t" + "b.lt L_AES_XTS_encrypt_NEON_start_1_%=\n\t" + "mov x22, %x[key]\n\t" + "ld1 {v0.16b, v1.16b}, [%x[in]], #32\n\t" + "ld1 {v4.16b}, [x22], #16\n\t" + "and x16, x17, x9, asr 63\n\t" + "extr x11, x9, x8, #63\n\t" + "eor x10, x16, x8, lsl 1\n\t" + "and x16, x17, x11, asr 63\n\t" + "extr x13, x11, x10, #63\n\t" + "eor x12, x16, x10, lsl 1\n\t" + "mov v2.d[0], x8\n\t" + "mov v2.d[1], x9\n\t" + "mov v3.d[0], x10\n\t" + "mov v3.d[1], x11\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "eor v1.16b, v1.16b, 
v3.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "sub w21, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_encrypt_NEON_loop_nr_2_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v10.16b, v4.16b, #1\n\t" + "shl v11.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "shl v10.4s, v0.4s, #8\n\t" + "shl v11.4s, v1.4s, #8\n\t" + "sri v10.4s, v0.4s, #24\n\t" + "sri v11.4s, v1.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, #24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + /* XOR 
in Key Schedule */ + "ld1 {v0.2d}, [x22], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* Round Done */ + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "shl v10.16b, v0.16b, #1\n\t" + "shl v11.16b, v1.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v4.16b, v8.16b, v0.16b\n\t" + "eor v5.16b, v9.16b, v1.16b\n\t" + "shl v10.4s, v4.4s, #8\n\t" + "shl v11.4s, v5.4s, #8\n\t" + "sri v10.4s, v4.4s, #24\n\t" + "sri v11.4s, v5.4s, #24\n\t" + "shl v4.4s, v0.4s, #24\n\t" + "shl v5.4s, v1.4s, #24\n\t" + "sri v4.4s, v0.4s, #8\n\t" + "sri v5.4s, v1.4s, #8\n\t" + "rev32 v0.8h, v0.8h\n\t" + "rev32 v1.8h, v1.8h\n\t" + "eor v0.16b, 
v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x22], #16\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + /* Round Done */ + "subs w21, w21, #2\n\t" + "b.ne L_AES_XTS_encrypt_NEON_loop_nr_2_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[shuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v10.16b, v4.16b, #1\n\t" + "shl v11.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v15.16b\n\t" + "and v9.16b, v9.16b, v15.16b\n\t" + "eor v8.16b, v8.16b, v10.16b\n\t" + "eor v9.16b, v9.16b, v11.16b\n\t" + "eor v0.16b, v8.16b, v4.16b\n\t" + "eor v1.16b, v9.16b, v5.16b\n\t" + "shl v10.4s, v0.4s, #8\n\t" + "shl v11.4s, v1.4s, #8\n\t" + "sri v10.4s, v0.4s, #24\n\t" + "sri v11.4s, v1.4s, #24\n\t" + "shl v0.4s, v4.4s, #24\n\t" + "shl v1.4s, v5.4s, 
#24\n\t" + "sri v0.4s, v4.4s, #8\n\t" + "sri v1.4s, v5.4s, #8\n\t" + "rev32 v4.8h, v4.8h\n\t" + "rev32 v5.8h, v5.8h\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x22], #16\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* Round Done */ + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[shuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x22], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "eor v1.16b, v1.16b, v3.16b\n\t" + "st1 {v0.16b, v1.16b}, [%x[out]], #32\n\t" + "and x16, x17, x11, asr 63\n\t" + "extr x9, x11, x10, #63\n\t" + "eor x8, x16, x10, lsl 1\n\t" + "sub %w[sz], %w[sz], #32\n\t" + 
"\n" + "L_AES_XTS_encrypt_NEON_start_1_%=: \n\t" + "ld1 {v3.2d}, [%[shuffle]]\n\t" + "mov v2.d[0], x8\n\t" + "mov v2.d[1], x9\n\t" + "cmp %w[sz], #16\n\t" + "b.lt L_AES_XTS_encrypt_NEON_start_partial_%=\n\t" + "mov x22, %x[key]\n\t" + "ld1 {v0.16b}, [%x[in]], #16\n\t" + "ld1 {v4.2d}, [x22], #16\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "sub w21, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_encrypt_NEON_loop_nr_1_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x22], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" 
+ "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x22], #16\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v0.8h\n\t" + "eor v11.16b, v10.16b, v0.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v0.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v4.16b\n\t" + "sri v9.4s, v0.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v0.16b, v10.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "subs w21, w21, #2\n\t" + "b.ne L_AES_XTS_encrypt_NEON_loop_nr_1_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x22], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl 
v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x22], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "st1 {v0.16b}, [%x[out]], #16\n\t" + "subs %w[sz], %w[sz], #16\n\t" + "b.eq L_AES_XTS_encrypt_NEON_data_done_%=\n\t" + "and x16, x17, x9, asr 63\n\t" + "extr x9, x9, x8, #63\n\t" + "eor x8, x16, x8, lsl 1\n\t" + "\n" + "L_AES_XTS_encrypt_NEON_start_partial_%=: \n\t" + "cbz %w[sz], L_AES_XTS_encrypt_NEON_data_done_%=\n\t" + "mov v2.d[0], x8\n\t" + "mov v2.d[1], x9\n\t" + "mov x22, %x[key]\n\t" + "sub %x[out], %x[out], #16\n\t" + "ld1 {v0.16b}, [%x[out]], #16\n\t" + "st1 {v0.2d}, [%x[tmp]]\n\t" + "mov w16, %w[sz]\n\t" + "\n" + "L_AES_XTS_encrypt_NEON_start_byte_%=: \n\t" + "ldrb w10, [%x[tmp]]\n\t" + "ldrb w11, [%x[in]], #1\n\t" + "strb w10, [%x[out]], #1\n\t" + "strb w11, [%x[tmp]], #1\n\t" + "subs w16, w16, #1\n\t" + "b.gt L_AES_XTS_encrypt_NEON_start_byte_%=\n\t" + "sub %x[out], %x[out], %x[sz]\n\t" + "sub %x[tmp], %x[tmp], %x[sz]\n\t" + "sub %x[out], %x[out], #16\n\t" + "ld1 {v0.2d}, [%x[tmp]]\n\t" + "ld1 {v4.2d}, [x22], #16\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "sub w21, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_encrypt_NEON_loop_nr_partial_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, 
v3.16b\n\t" + "ld1 {v0.2d}, [x22], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x22], #16\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v0.8h\n\t" + "eor v11.16b, v10.16b, v0.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v0.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v4.16b\n\t" + "sri v9.4s, v0.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v0.16b, v10.16b, v9.16b\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "subs w21, w21, #2\n\t" + "b.ne L_AES_XTS_encrypt_NEON_loop_nr_partial_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, 
v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v0.2d}, [x22], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v0.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x22], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "st1 {v0.16b}, [%x[out]]\n\t" + "\n" + "L_AES_XTS_encrypt_NEON_data_done_%=: \n\t" + "ldp x29, x30, [sp], #32\n\t" + : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), + [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr) + : [in] "r" (in), [i] "r" (i), [te] "r" (te), [shuffle] "r" (shuffle) + : "memory", "cc", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x21", "x22", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", 
"v30", "v31" + ); +} + +#ifdef HAVE_AES_DECRYPT +void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, const byte* i, + byte* key, byte* key2, byte* tmp, int nr) +{ + const word8* te = L_AES_ARM64_NEON_te; + const word8* td = L_AES_ARM64_NEON_td; + const word8* shuffle = L_AES_ARM64_NEON_shift_rows_shuffle; + const word8* invshuffle = L_AES_ARM64_NEON_shift_rows_invshuffle; + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" + "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[te]], #0x40\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[te]], #0x40\n\t" + "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[te]], #0x40\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[te]]\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "movi v15.16b, #27\n\t" + "ld1 {v3.2d}, [%[shuffle]]\n\t" + "mov x17, #0x87\n\t" + "ands w19, %w[sz], #15\n\t" + "cset w16, ne\n\t" + "lsl w16, w16, #4\n\t" + "sub %w[sz], %w[sz], w16\n\t" + "ld1 {v2.2d}, [%x[i]]\n\t" + "ld1 {v4.2d}, [%x[key2]]\n\t" + "rev32 v2.16b, v2.16b\n\t" + "add x25, %x[key2], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "eor v2.16b, v2.16b, v4.16b\n\t" + "sub w24, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_loop_nr_tweak_%=: \n\t" + "eor v8.16b, v2.16b, v12.16b\n\t" + "eor v9.16b, v2.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v2.2d}, [x25], #16\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor 
v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v2.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v2.16b, v2.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v2.16b, v2.16b, v9.16b\n\t" + "tbl v2.16b, {v2.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + "sshr v10.16b, v2.16b, #7\n\t" + "shl v9.16b, v2.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v2.8h\n\t" + "eor v11.16b, v10.16b, v2.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v2.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v4.16b\n\t" + "sri v9.4s, v2.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v2.16b, v10.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v8.16b\n\t" + "subs w24, w24, #2\n\t" + "b.ne L_AES_XTS_decrypt_NEON_loop_nr_tweak_%=\n\t" + "eor v8.16b, v2.16b, v12.16b\n\t" + "eor v9.16b, v2.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "ld1 {v2.2d}, [x25], #16\n\t" + "sshr 
v10.16b, v4.16b, #7\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "rev32 v8.8h, v4.8h\n\t" + "eor v11.16b, v10.16b, v4.16b\n\t" + "eor v10.16b, v10.16b, v8.16b\n\t" + "shl v9.4s, v4.4s, #24\n\t" + "shl v8.4s, v11.4s, #8\n\t" + /* XOR in Key Schedule */ + "eor v10.16b, v10.16b, v2.16b\n\t" + "sri v9.4s, v4.4s, #8\n\t" + "sri v8.4s, v11.4s, #24\n\t" + "eor v4.16b, v10.16b, v9.16b\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v2.16b, v2.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v2.16b, v2.16b, v9.16b\n\t" + "tbl v2.16b, {v2.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v2.16b, v2.16b, v4.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "mov x8, v2.d[0]\n\t" + "mov x9, v2.d[1]\n\t" + "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%[td]], #0x40\n\t" + "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%[td]], #0x40\n\t" + "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%[td]], #0x40\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + "ld1 {v3.2d}, [%[invshuffle]]\n\t" + "cmp %w[sz], #0x40\n\t" + "b.lt L_AES_XTS_decrypt_NEON_start_2_%=\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_loop_4_%=: \n\t" + "mov x25, %x[key]\n\t" + "ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[in]], #0x40\n\t" + "ld1 {v4.16b}, [x25], #16\n\t" + "and x16, x17, x9, asr 63\n\t" + "extr x11, x9, x8, #63\n\t" + "eor x10, x16, x8, lsl 1\n\t" + "and x16, x17, x11, asr 63\n\t" + "extr x13, x11, x10, #63\n\t" + "eor x12, x16, x10, lsl 1\n\t" + "and x16, x17, x13, asr 63\n\t" + "extr x15, x13, x12, #63\n\t" + "eor x14, x16, x12, lsl 1\n\t" 
+ "mov v8.d[0], x8\n\t" + "mov v8.d[1], x9\n\t" + "mov v9.d[0], x10\n\t" + "mov v9.d[1], x11\n\t" + "mov v10.d[0], x12\n\t" + "mov v10.d[1], x13\n\t" + "mov v11.d[0], x14\n\t" + "mov v11.d[1], x15\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + "sub w24, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_loop_nr_4_%=: \n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr 
v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "movi v28.16b, #27\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "and v8.16b, v8.16b, v28.16b\n\t" + "and v9.16b, v9.16b, v28.16b\n\t" + "and v10.16b, v10.16b, v28.16b\n\t" + "and v11.16b, v11.16b, v28.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "ushr v12.16b, v4.16b, #6\n\t" + "ushr v13.16b, v5.16b, #6\n\t" + "ushr v14.16b, v6.16b, #6\n\t" + "ushr v15.16b, v7.16b, #6\n\t" + "shl v0.16b, v4.16b, #2\n\t" + "shl v1.16b, v5.16b, #2\n\t" + "shl v2.16b, v6.16b, #2\n\t" + "shl v3.16b, v7.16b, #2\n\t" + "pmul v12.16b, v12.16b, v28.16b\n\t" + "pmul v13.16b, v13.16b, v28.16b\n\t" + "pmul v14.16b, v14.16b, v28.16b\n\t" + "pmul v15.16b, v15.16b, v28.16b\n\t" + "eor v12.16b, v12.16b, v0.16b\n\t" + "eor v13.16b, v13.16b, v1.16b\n\t" + "eor v14.16b, v14.16b, v2.16b\n\t" + "eor v15.16b, 
v15.16b, v3.16b\n\t" + "ushr v0.16b, v4.16b, #5\n\t" + "ushr v1.16b, v5.16b, #5\n\t" + "ushr v2.16b, v6.16b, #5\n\t" + "ushr v3.16b, v7.16b, #5\n\t" + "pmul v0.16b, v0.16b, v28.16b\n\t" + "pmul v1.16b, v1.16b, v28.16b\n\t" + "pmul v2.16b, v2.16b, v28.16b\n\t" + "pmul v3.16b, v3.16b, v28.16b\n\t" + "shl v28.16b, v4.16b, #3\n\t" + "shl v29.16b, v5.16b, #3\n\t" + "shl v30.16b, v6.16b, #3\n\t" + "shl v31.16b, v7.16b, #3\n\t" + "eor v0.16b, v0.16b, v28.16b\n\t" + "eor v1.16b, v1.16b, v29.16b\n\t" + "eor v2.16b, v2.16b, v30.16b\n\t" + "eor v3.16b, v3.16b, v31.16b\n\t" + "eor v28.16b, v8.16b, v0.16b\n\t" + "eor v29.16b, v9.16b, v1.16b\n\t" + "eor v30.16b, v10.16b, v2.16b\n\t" + "eor v31.16b, v11.16b, v3.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "eor v8.16b, v12.16b, v0.16b\n\t" + "eor v9.16b, v13.16b, v1.16b\n\t" + "eor v10.16b, v14.16b, v2.16b\n\t" + "eor v11.16b, v15.16b, v3.16b\n\t" + "eor v12.16b, v12.16b, v28.16b\n\t" + "eor v13.16b, v13.16b, v29.16b\n\t" + "eor v14.16b, v14.16b, v30.16b\n\t" + "eor v15.16b, v15.16b, v31.16b\n\t" + "eor v28.16b, v28.16b, v4.16b\n\t" + "eor v29.16b, v29.16b, v5.16b\n\t" + "eor v30.16b, v30.16b, v6.16b\n\t" + "eor v31.16b, v31.16b, v7.16b\n\t" + "shl v4.4s, v28.4s, #8\n\t" + "shl v5.4s, v29.4s, #8\n\t" + "shl v6.4s, v30.4s, #8\n\t" + "shl v7.4s, v31.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "rev32 v10.8h, v10.8h\n\t" + "rev32 v11.8h, v11.8h\n\t" + "sri v4.4s, v28.4s, #24\n\t" + "sri v5.4s, v29.4s, #24\n\t" + "sri v6.4s, v30.4s, #24\n\t" + "sri v7.4s, v31.4s, #24\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + "shl v28.4s, v0.4s, #24\n\t" + "shl v29.4s, v1.4s, #24\n\t" + "shl v30.4s, v2.4s, #24\n\t" + "shl v31.4s, v3.4s, #24\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + 
"eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "sri v28.4s, v0.4s, #8\n\t" + "sri v29.4s, v1.4s, #8\n\t" + "sri v30.4s, v2.4s, #8\n\t" + "sri v31.4s, v3.4s, #8\n\t" + "eor v4.16b, v4.16b, v28.16b\n\t" + "eor v5.16b, v5.16b, v29.16b\n\t" + "eor v6.16b, v6.16b, v30.16b\n\t" + "eor v7.16b, v7.16b, v31.16b\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x25], #16\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr 
v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + "movi v28.16b, #27\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "sshr v10.16b, v2.16b, #7\n\t" + "sshr v11.16b, v3.16b, #7\n\t" + "shl v12.16b, v0.16b, #1\n\t" + "shl v13.16b, v1.16b, #1\n\t" + "shl v14.16b, v2.16b, #1\n\t" + "shl v15.16b, v3.16b, #1\n\t" + "and v8.16b, v8.16b, v28.16b\n\t" + "and v9.16b, v9.16b, v28.16b\n\t" + "and v10.16b, v10.16b, v28.16b\n\t" + "and v11.16b, v11.16b, v28.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "ushr v12.16b, v0.16b, #6\n\t" + "ushr v13.16b, v1.16b, #6\n\t" + "ushr v14.16b, v2.16b, #6\n\t" + "ushr v15.16b, v3.16b, #6\n\t" + "shl v4.16b, v0.16b, #2\n\t" + "shl v5.16b, v1.16b, #2\n\t" + "shl v6.16b, v2.16b, #2\n\t" + "shl v7.16b, v3.16b, #2\n\t" + "pmul v12.16b, v12.16b, v28.16b\n\t" + "pmul v13.16b, v13.16b, v28.16b\n\t" + "pmul v14.16b, v14.16b, v28.16b\n\t" + "pmul v15.16b, v15.16b, v28.16b\n\t" + "eor v12.16b, v12.16b, v4.16b\n\t" + "eor v13.16b, v13.16b, v5.16b\n\t" + "eor v14.16b, v14.16b, v6.16b\n\t" + "eor v15.16b, v15.16b, v7.16b\n\t" + "ushr v4.16b, v0.16b, #5\n\t" + "ushr v5.16b, v1.16b, 
#5\n\t" + "ushr v6.16b, v2.16b, #5\n\t" + "ushr v7.16b, v3.16b, #5\n\t" + "pmul v4.16b, v4.16b, v28.16b\n\t" + "pmul v5.16b, v5.16b, v28.16b\n\t" + "pmul v6.16b, v6.16b, v28.16b\n\t" + "pmul v7.16b, v7.16b, v28.16b\n\t" + "shl v28.16b, v0.16b, #3\n\t" + "shl v29.16b, v1.16b, #3\n\t" + "shl v30.16b, v2.16b, #3\n\t" + "shl v31.16b, v3.16b, #3\n\t" + "eor v4.16b, v4.16b, v28.16b\n\t" + "eor v5.16b, v5.16b, v29.16b\n\t" + "eor v6.16b, v6.16b, v30.16b\n\t" + "eor v7.16b, v7.16b, v31.16b\n\t" + "eor v28.16b, v8.16b, v4.16b\n\t" + "eor v29.16b, v9.16b, v5.16b\n\t" + "eor v30.16b, v10.16b, v6.16b\n\t" + "eor v31.16b, v11.16b, v7.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v6.16b, v6.16b, v2.16b\n\t" + "eor v7.16b, v7.16b, v3.16b\n\t" + "eor v8.16b, v12.16b, v4.16b\n\t" + "eor v9.16b, v13.16b, v5.16b\n\t" + "eor v10.16b, v14.16b, v6.16b\n\t" + "eor v11.16b, v15.16b, v7.16b\n\t" + "eor v12.16b, v12.16b, v28.16b\n\t" + "eor v13.16b, v13.16b, v29.16b\n\t" + "eor v14.16b, v14.16b, v30.16b\n\t" + "eor v15.16b, v15.16b, v31.16b\n\t" + "eor v28.16b, v28.16b, v0.16b\n\t" + "eor v29.16b, v29.16b, v1.16b\n\t" + "eor v30.16b, v30.16b, v2.16b\n\t" + "eor v31.16b, v31.16b, v3.16b\n\t" + "shl v0.4s, v28.4s, #8\n\t" + "shl v1.4s, v29.4s, #8\n\t" + "shl v2.4s, v30.4s, #8\n\t" + "shl v3.4s, v31.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "rev32 v10.8h, v10.8h\n\t" + "rev32 v11.8h, v11.8h\n\t" + "sri v0.4s, v28.4s, #24\n\t" + "sri v1.4s, v29.4s, #24\n\t" + "sri v2.4s, v30.4s, #24\n\t" + "sri v3.4s, v31.4s, #24\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "eor v2.16b, v2.16b, v14.16b\n\t" + "eor v3.16b, v3.16b, v15.16b\n\t" + "shl v28.4s, v4.4s, #24\n\t" + "shl v29.4s, v5.4s, #24\n\t" + "shl v30.4s, v6.4s, #24\n\t" + "shl v31.4s, v7.4s, #24\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "sri 
v28.4s, v4.4s, #8\n\t" + "sri v29.4s, v5.4s, #8\n\t" + "sri v30.4s, v6.4s, #8\n\t" + "sri v31.4s, v7.4s, #8\n\t" + "eor v0.16b, v0.16b, v28.16b\n\t" + "eor v1.16b, v1.16b, v29.16b\n\t" + "eor v2.16b, v2.16b, v30.16b\n\t" + "eor v3.16b, v3.16b, v31.16b\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + /* Round Done */ + "subs w24, w24, #2\n\t" + "b.ne L_AES_XTS_decrypt_NEON_loop_nr_4_%=\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v6.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v2.16b\n\t" + "tbl v7.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v3.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "eor v10.16b, v2.16b, v12.16b\n\t" + "eor v11.16b, v3.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v13.16b\n\t" + "eor v9.16b, v1.16b, v13.16b\n\t" + "eor v10.16b, v2.16b, v13.16b\n\t" + "eor v11.16b, v3.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, 
v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "eor v10.16b, v2.16b, v14.16b\n\t" + "eor v11.16b, v3.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "orr v6.16b, v6.16b, v10.16b\n\t" + "orr v7.16b, v7.16b, v11.16b\n\t" + "ld1 {v0.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "tbl v6.16b, {v6.16b}, v0.16b\n\t" + "tbl v7.16b, {v7.16b}, v0.16b\n\t" + "movi v28.16b, #27\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "sshr v10.16b, v6.16b, #7\n\t" + "sshr v11.16b, v7.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "shl v14.16b, v6.16b, #1\n\t" + "shl v15.16b, v7.16b, #1\n\t" + "and v8.16b, v8.16b, v28.16b\n\t" + "and v9.16b, v9.16b, v28.16b\n\t" + "and v10.16b, v10.16b, v28.16b\n\t" + "and v11.16b, v11.16b, v28.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "eor v10.16b, v10.16b, v14.16b\n\t" + "eor v11.16b, v11.16b, v15.16b\n\t" + "ushr v12.16b, v4.16b, #6\n\t" + "ushr v13.16b, v5.16b, #6\n\t" + "ushr v14.16b, v6.16b, #6\n\t" + "ushr v15.16b, v7.16b, #6\n\t" + "shl v0.16b, v4.16b, #2\n\t" + "shl v1.16b, v5.16b, #2\n\t" + "shl v2.16b, v6.16b, #2\n\t" + "shl v3.16b, v7.16b, #2\n\t" + "pmul v12.16b, v12.16b, v28.16b\n\t" + "pmul v13.16b, v13.16b, v28.16b\n\t" + "pmul v14.16b, v14.16b, v28.16b\n\t" + "pmul v15.16b, v15.16b, v28.16b\n\t" + "eor v12.16b, v12.16b, v0.16b\n\t" + "eor v13.16b, v13.16b, v1.16b\n\t" + "eor v14.16b, v14.16b, v2.16b\n\t" + "eor v15.16b, v15.16b, v3.16b\n\t" + "ushr v0.16b, v4.16b, #5\n\t" + "ushr v1.16b, v5.16b, #5\n\t" 
+ "ushr v2.16b, v6.16b, #5\n\t" + "ushr v3.16b, v7.16b, #5\n\t" + "pmul v0.16b, v0.16b, v28.16b\n\t" + "pmul v1.16b, v1.16b, v28.16b\n\t" + "pmul v2.16b, v2.16b, v28.16b\n\t" + "pmul v3.16b, v3.16b, v28.16b\n\t" + "shl v28.16b, v4.16b, #3\n\t" + "shl v29.16b, v5.16b, #3\n\t" + "shl v30.16b, v6.16b, #3\n\t" + "shl v31.16b, v7.16b, #3\n\t" + "eor v0.16b, v0.16b, v28.16b\n\t" + "eor v1.16b, v1.16b, v29.16b\n\t" + "eor v2.16b, v2.16b, v30.16b\n\t" + "eor v3.16b, v3.16b, v31.16b\n\t" + "eor v28.16b, v8.16b, v0.16b\n\t" + "eor v29.16b, v9.16b, v1.16b\n\t" + "eor v30.16b, v10.16b, v2.16b\n\t" + "eor v31.16b, v11.16b, v3.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v2.16b, v2.16b, v6.16b\n\t" + "eor v3.16b, v3.16b, v7.16b\n\t" + "eor v8.16b, v12.16b, v0.16b\n\t" + "eor v9.16b, v13.16b, v1.16b\n\t" + "eor v10.16b, v14.16b, v2.16b\n\t" + "eor v11.16b, v15.16b, v3.16b\n\t" + "eor v12.16b, v12.16b, v28.16b\n\t" + "eor v13.16b, v13.16b, v29.16b\n\t" + "eor v14.16b, v14.16b, v30.16b\n\t" + "eor v15.16b, v15.16b, v31.16b\n\t" + "eor v28.16b, v28.16b, v4.16b\n\t" + "eor v29.16b, v29.16b, v5.16b\n\t" + "eor v30.16b, v30.16b, v6.16b\n\t" + "eor v31.16b, v31.16b, v7.16b\n\t" + "shl v4.4s, v28.4s, #8\n\t" + "shl v5.4s, v29.4s, #8\n\t" + "shl v6.4s, v30.4s, #8\n\t" + "shl v7.4s, v31.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "rev32 v10.8h, v10.8h\n\t" + "rev32 v11.8h, v11.8h\n\t" + "sri v4.4s, v28.4s, #24\n\t" + "sri v5.4s, v29.4s, #24\n\t" + "sri v6.4s, v30.4s, #24\n\t" + "sri v7.4s, v31.4s, #24\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "eor v6.16b, v6.16b, v14.16b\n\t" + "eor v7.16b, v7.16b, v15.16b\n\t" + "shl v28.4s, v0.4s, #24\n\t" + "shl v29.4s, v1.4s, #24\n\t" + "shl v30.4s, v2.4s, #24\n\t" + "shl v31.4s, v3.4s, #24\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "eor v6.16b, v6.16b, v10.16b\n\t" + "eor v7.16b, v7.16b, v11.16b\n\t" + "sri v28.4s, 
v0.4s, #8\n\t" + "sri v29.4s, v1.4s, #8\n\t" + "sri v30.4s, v2.4s, #8\n\t" + "sri v31.4s, v3.4s, #8\n\t" + "eor v4.16b, v4.16b, v28.16b\n\t" + "eor v5.16b, v5.16b, v29.16b\n\t" + "eor v6.16b, v6.16b, v30.16b\n\t" + "eor v7.16b, v7.16b, v31.16b\n\t" + "ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [%[td]]\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x25], #16\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + "eor v6.16b, v6.16b, v0.16b\n\t" + "eor v7.16b, v7.16b, v0.16b\n\t" + /* Round Done */ + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v2.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v6.16b\n\t" + "tbl v3.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v7.16b\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "eor v10.16b, v6.16b, v12.16b\n\t" + "eor v11.16b, v7.16b, v12.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "tbl v10.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v10.16b\n\t" + "tbl v11.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, v13.16b\n\t" + "eor v9.16b, v5.16b, v13.16b\n\t" + "eor v10.16b, v6.16b, v13.16b\n\t" + "eor v11.16b, v7.16b, v13.16b\n\t" + "tbl v8.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "eor v8.16b, v4.16b, 
v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "eor v10.16b, v6.16b, v14.16b\n\t" + "eor v11.16b, v7.16b, v14.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "tbl v11.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "orr v2.16b, v2.16b, v10.16b\n\t" + "orr v3.16b, v3.16b, v11.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "tbl v2.16b, {v2.16b}, v4.16b\n\t" + "tbl v3.16b, {v3.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "eor v2.16b, v2.16b, v4.16b\n\t" + "eor v3.16b, v3.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "rev32 v2.16b, v2.16b\n\t" + "rev32 v3.16b, v3.16b\n\t" + "mov v8.d[0], x8\n\t" + "mov v8.d[1], x9\n\t" + "mov v9.d[0], x10\n\t" + "mov v9.d[1], x11\n\t" + "mov v10.d[0], x12\n\t" + "mov v10.d[1], x13\n\t" + "mov v11.d[0], x14\n\t" + "mov v11.d[1], x15\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "eor v2.16b, v2.16b, v10.16b\n\t" + "eor v3.16b, v3.16b, v11.16b\n\t" + "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%x[out]], #0x40\n\t" + "and x16, x17, x15, asr 63\n\t" + "extr x9, x15, x14, #63\n\t" + "eor x8, x16, x14, lsl 1\n\t" + "sub %w[sz], %w[sz], #0x40\n\t" + "cmp %w[sz], #0x40\n\t" + "b.ge L_AES_XTS_decrypt_NEON_loop_4_%=\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "movi v15.16b, #27\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_start_2_%=: \n\t" + "cmp %w[sz], #32\n\t" + "b.lt L_AES_XTS_decrypt_NEON_start_1_%=\n\t" + "mov x25, %x[key]\n\t" + "ld1 {v0.16b, v1.16b}, [%x[in]], #32\n\t" + "ld1 {v4.16b}, [x25], #16\n\t" + 
"and x16, x17, x9, asr 63\n\t" + "extr x11, x9, x8, #63\n\t" + "eor x10, x16, x8, lsl 1\n\t" + "and x16, x17, x11, asr 63\n\t" + "extr x13, x11, x10, #63\n\t" + "eor x12, x16, x10, lsl 1\n\t" + "mov v2.d[0], x8\n\t" + "mov v2.d[1], x9\n\t" + "mov v3.d[0], x10\n\t" + "mov v3.d[1], x11\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "eor v1.16b, v1.16b, v3.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + "sub w24, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_loop_nr_2_%=: \n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "movi v10.16b, #27\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl v13.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v10.16b\n\t" + "and v9.16b, v9.16b, v10.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, 
v9.16b, v13.16b\n\t" + "ushr v12.16b, v4.16b, #6\n\t" + "ushr v13.16b, v5.16b, #6\n\t" + "shl v0.16b, v4.16b, #2\n\t" + "shl v1.16b, v5.16b, #2\n\t" + "pmul v12.16b, v12.16b, v10.16b\n\t" + "pmul v13.16b, v13.16b, v10.16b\n\t" + "eor v12.16b, v12.16b, v0.16b\n\t" + "eor v13.16b, v13.16b, v1.16b\n\t" + "ushr v0.16b, v4.16b, #5\n\t" + "ushr v1.16b, v5.16b, #5\n\t" + "pmul v0.16b, v0.16b, v10.16b\n\t" + "pmul v1.16b, v1.16b, v10.16b\n\t" + "shl v10.16b, v4.16b, #3\n\t" + "shl v11.16b, v5.16b, #3\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + "eor v10.16b, v8.16b, v0.16b\n\t" + "eor v11.16b, v9.16b, v1.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v8.16b, v12.16b, v0.16b\n\t" + "eor v9.16b, v13.16b, v1.16b\n\t" + "eor v12.16b, v12.16b, v10.16b\n\t" + "eor v13.16b, v13.16b, v11.16b\n\t" + "eor v10.16b, v10.16b, v4.16b\n\t" + "eor v11.16b, v11.16b, v5.16b\n\t" + "shl v4.4s, v10.4s, #8\n\t" + "shl v5.4s, v11.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "sri v4.4s, v10.4s, #24\n\t" + "sri v5.4s, v11.4s, #24\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "shl v10.4s, v0.4s, #24\n\t" + "shl v11.4s, v1.4s, #24\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "sri v10.4s, v0.4s, #8\n\t" + "sri v11.4s, v1.4s, #8\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x25], #16\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + /* Round Done */ + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, 
v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + "movi v10.16b, #27\n\t" + "sshr v8.16b, v0.16b, #7\n\t" + "sshr v9.16b, v1.16b, #7\n\t" + "shl v12.16b, v0.16b, #1\n\t" + "shl v13.16b, v1.16b, #1\n\t" + "and v8.16b, v8.16b, v10.16b\n\t" + "and v9.16b, v9.16b, v10.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "ushr v12.16b, v0.16b, #6\n\t" + "ushr v13.16b, v1.16b, #6\n\t" + "shl v4.16b, v0.16b, #2\n\t" + "shl v5.16b, v1.16b, #2\n\t" + "pmul v12.16b, v12.16b, v10.16b\n\t" + "pmul v13.16b, v13.16b, v10.16b\n\t" + "eor v12.16b, v12.16b, v4.16b\n\t" + "eor v13.16b, v13.16b, v5.16b\n\t" + "ushr v4.16b, v0.16b, #5\n\t" + "ushr v5.16b, v1.16b, #5\n\t" + "pmul v4.16b, v4.16b, v10.16b\n\t" + "pmul v5.16b, v5.16b, v10.16b\n\t" + "shl v10.16b, v0.16b, #3\n\t" + "shl v11.16b, v1.16b, #3\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + "eor v10.16b, v8.16b, v4.16b\n\t" + "eor v11.16b, v9.16b, v5.16b\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v1.16b\n\t" + "eor v8.16b, v12.16b, v4.16b\n\t" + "eor v9.16b, v13.16b, v5.16b\n\t" + "eor v12.16b, v12.16b, v10.16b\n\t" + "eor v13.16b, v13.16b, v11.16b\n\t" + "eor v10.16b, v10.16b, v0.16b\n\t" + "eor v11.16b, v11.16b, v1.16b\n\t" + "shl v0.4s, v10.4s, 
#8\n\t" + "shl v1.4s, v11.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "sri v0.4s, v10.4s, #24\n\t" + "sri v1.4s, v11.4s, #24\n\t" + "eor v0.16b, v0.16b, v12.16b\n\t" + "eor v1.16b, v1.16b, v13.16b\n\t" + "shl v10.4s, v4.4s, #24\n\t" + "shl v11.4s, v5.4s, #24\n\t" + "eor v0.16b, v0.16b, v8.16b\n\t" + "eor v1.16b, v1.16b, v9.16b\n\t" + "sri v10.4s, v4.4s, #8\n\t" + "sri v11.4s, v5.4s, #8\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + /* Round Done */ + "subs w24, w24, #2\n\t" + "b.ne L_AES_XTS_decrypt_NEON_loop_nr_2_%=\n\t" + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v1.16b, v12.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v5.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v1.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v0.16b, v13.16b\n\t" + "eor v11.16b, v1.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "eor v8.16b, v0.16b, v14.16b\n\t" + "eor v9.16b, v1.16b, v14.16b\n\t" + "orr v4.16b, v4.16b, v10.16b\n\t" + "orr v5.16b, v5.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v5.16b, v5.16b, v9.16b\n\t" + "ld1 {v0.16b}, [%[invshuffle]]\n\t" + "tbl v4.16b, {v4.16b}, v0.16b\n\t" + "tbl v5.16b, {v5.16b}, v0.16b\n\t" + "movi v10.16b, #27\n\t" + "sshr v8.16b, v4.16b, #7\n\t" + "sshr v9.16b, v5.16b, #7\n\t" + "shl v12.16b, v4.16b, #1\n\t" + "shl 
v13.16b, v5.16b, #1\n\t" + "and v8.16b, v8.16b, v10.16b\n\t" + "and v9.16b, v9.16b, v10.16b\n\t" + "eor v8.16b, v8.16b, v12.16b\n\t" + "eor v9.16b, v9.16b, v13.16b\n\t" + "ushr v12.16b, v4.16b, #6\n\t" + "ushr v13.16b, v5.16b, #6\n\t" + "shl v0.16b, v4.16b, #2\n\t" + "shl v1.16b, v5.16b, #2\n\t" + "pmul v12.16b, v12.16b, v10.16b\n\t" + "pmul v13.16b, v13.16b, v10.16b\n\t" + "eor v12.16b, v12.16b, v0.16b\n\t" + "eor v13.16b, v13.16b, v1.16b\n\t" + "ushr v0.16b, v4.16b, #5\n\t" + "ushr v1.16b, v5.16b, #5\n\t" + "pmul v0.16b, v0.16b, v10.16b\n\t" + "pmul v1.16b, v1.16b, v10.16b\n\t" + "shl v10.16b, v4.16b, #3\n\t" + "shl v11.16b, v5.16b, #3\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "eor v1.16b, v1.16b, v11.16b\n\t" + "eor v10.16b, v8.16b, v0.16b\n\t" + "eor v11.16b, v9.16b, v1.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v5.16b\n\t" + "eor v8.16b, v12.16b, v0.16b\n\t" + "eor v9.16b, v13.16b, v1.16b\n\t" + "eor v12.16b, v12.16b, v10.16b\n\t" + "eor v13.16b, v13.16b, v11.16b\n\t" + "eor v10.16b, v10.16b, v4.16b\n\t" + "eor v11.16b, v11.16b, v5.16b\n\t" + "shl v4.4s, v10.4s, #8\n\t" + "shl v5.4s, v11.4s, #8\n\t" + "rev32 v8.8h, v8.8h\n\t" + "rev32 v9.8h, v9.8h\n\t" + "sri v4.4s, v10.4s, #24\n\t" + "sri v5.4s, v11.4s, #24\n\t" + "eor v4.16b, v4.16b, v12.16b\n\t" + "eor v5.16b, v5.16b, v13.16b\n\t" + "shl v10.4s, v0.4s, #24\n\t" + "shl v11.4s, v1.4s, #24\n\t" + "eor v4.16b, v4.16b, v8.16b\n\t" + "eor v5.16b, v5.16b, v9.16b\n\t" + "sri v10.4s, v0.4s, #8\n\t" + "sri v11.4s, v1.4s, #8\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "eor v5.16b, v5.16b, v11.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v0.2d}, [x25], #16\n\t" + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v5.16b, v5.16b, v0.16b\n\t" + /* Round Done */ + "movi v12.16b, #0x40\n\t" + "movi v13.16b, #0x80\n\t" + "movi v14.16b, #0xc0\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v5.16b, v12.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v1.16b, {v16.16b, 
v17.16b, v18.16b, v19.16b}, v5.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v9.16b\n\t" + "eor v10.16b, v4.16b, v13.16b\n\t" + "eor v11.16b, v5.16b, v13.16b\n\t" + "tbl v10.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v10.16b\n\t" + "tbl v11.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v11.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "eor v8.16b, v4.16b, v14.16b\n\t" + "eor v9.16b, v5.16b, v14.16b\n\t" + "orr v0.16b, v0.16b, v10.16b\n\t" + "orr v1.16b, v1.16b, v11.16b\n\t" + "tbl v8.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v8.16b\n\t" + "tbl v9.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v9.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v1.16b, v1.16b, v9.16b\n\t" + "ld1 {v4.16b}, [%[invshuffle]]\n\t" + "tbl v0.16b, {v0.16b}, v4.16b\n\t" + "tbl v1.16b, {v1.16b}, v4.16b\n\t" + /* XOR in Key Schedule */ + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "eor v1.16b, v1.16b, v4.16b\n\t" + /* Round Done */ + "rev32 v0.16b, v0.16b\n\t" + "rev32 v1.16b, v1.16b\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "eor v1.16b, v1.16b, v3.16b\n\t" + "st1 {v0.16b, v1.16b}, [%x[out]], #32\n\t" + "and x16, x17, x11, asr 63\n\t" + "extr x9, x11, x10, #63\n\t" + "eor x8, x16, x10, lsl 1\n\t" + "sub %w[sz], %w[sz], #32\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_start_1_%=: \n\t" + "ld1 {v3.2d}, [%[invshuffle]]\n\t" + "mov v2.d[0], x8\n\t" + "mov v2.d[1], x9\n\t" + "cmp %w[sz], #16\n\t" + "b.lt L_AES_XTS_decrypt_NEON_start_partial_%=\n\t" + "mov x25, %x[key]\n\t" + "ld1 {v0.16b}, [%x[in]], #16\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "sub w24, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_loop_nr_1_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, 
v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "ushr v11.16b, v0.16b, #6\n\t" + "ushr v8.16b, v0.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + 
"pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v0.16b\n\t" + "shl v0.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v0.4s, v9.4s, #24\n\t" + "eor v0.16b, v0.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v0.16b, v0.16b, v9.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "subs w24, w24, #2\n\t" + "b.ne L_AES_XTS_decrypt_NEON_loop_nr_1_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, 
v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "st1 {v0.16b}, [%x[out]], #16\n\t" + "sub %w[sz], %w[sz], #16\n\t" + "cbz w19, L_AES_XTS_decrypt_NEON_data_done_%=\n\t" + "and x16, x17, x9, asr 63\n\t" + "extr x9, x9, x8, #63\n\t" + "eor x8, x16, x8, lsl 1\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_start_partial_%=: \n\t" + "mov %w[sz], w19\n\t" + "cbz %w[sz], L_AES_XTS_decrypt_NEON_data_done_%=\n\t" + "mov v2.d[0], x8\n\t" + "mov v2.d[1], x9\n\t" + "and x16, x17, x9, asr 63\n\t" + "extr x11, x9, x8, #63\n\t" + "eor x10, x16, x8, lsl 1\n\t" + "mov v1.d[0], x10\n\t" + "mov v1.d[1], x11\n\t" + "mov x25, %x[key]\n\t" + "ld1 {v0.16b}, [%x[in]], #16\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "sub w24, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_loop_nr_partial_1_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, 
{v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "ushr v11.16b, v0.16b, #6\n\t" + "ushr v8.16b, v0.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + 
"shl v9.16b, v0.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v0.16b\n\t" + "shl v0.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v0.4s, v9.4s, #24\n\t" + "eor v0.16b, v0.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v0.16b, v0.16b, v9.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "subs w24, w24, #2\n\t" + "b.ne L_AES_XTS_decrypt_NEON_loop_nr_partial_1_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, 
#24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v1.16b\n\t" + "st1 {v0.2d}, [%x[tmp]]\n\t" + "add %x[out], %x[out], #16\n\t" + "mov w16, %w[sz]\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_start_byte_%=: \n\t" + "ldrb w10, [%x[tmp]]\n\t" + "ldrb w11, [%x[in]], #1\n\t" + "strb w10, [%x[out]], #1\n\t" + "strb w11, [%x[tmp]], #1\n\t" + "subs w16, w16, #1\n\t" + "b.gt L_AES_XTS_decrypt_NEON_start_byte_%=\n\t" + "sub %x[out], %x[out], %x[sz]\n\t" + "sub %x[tmp], %x[tmp], %x[sz]\n\t" + "sub %x[out], %x[out], #16\n\t" + "mov x25, %x[key]\n\t" + "ld1 {v0.2d}, [%x[tmp]]\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v4.16b\n\t" + "sub w24, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_loop_nr_partial_2_%=: \n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, 
v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "sshr v10.16b, v0.16b, #7\n\t" + "ushr v11.16b, v0.16b, #6\n\t" + "ushr v8.16b, v0.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v0.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v0.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl 
v9.16b, v0.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v0.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v0.16b\n\t" + "shl v0.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v0.4s, v9.4s, #24\n\t" + "eor v0.16b, v0.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v0.16b, v0.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" + "eor v0.16b, v0.16b, v9.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "subs w24, w24, #2\n\t" + "b.ne L_AES_XTS_decrypt_NEON_loop_nr_partial_2_%=\n\t" + "eor v8.16b, v0.16b, v12.16b\n\t" + "eor v9.16b, v0.16b, v13.16b\n\t" + "eor v10.16b, v0.16b, v14.16b\n\t" + "tbl v4.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v0.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v4.16b, v4.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v4.16b, v4.16b, v9.16b\n\t" + "tbl v4.16b, {v4.16b}, v3.16b\n\t" + "sshr v10.16b, v4.16b, #7\n\t" + "ushr v11.16b, v4.16b, #6\n\t" + "ushr v8.16b, v4.16b, #5\n\t" + "and v10.16b, v10.16b, v15.16b\n\t" + "pmul v11.16b, v11.16b, v15.16b\n\t" + "pmul v8.16b, v8.16b, v15.16b\n\t" + "shl v9.16b, v4.16b, #1\n\t" + "eor v10.16b, v10.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #3\n\t" + "eor v8.16b, v8.16b, v9.16b\n\t" + "shl v9.16b, v4.16b, #2\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v10.16b, v8.16b\n\t" + "eor v8.16b, v8.16b, v4.16b\n\t" + "eor v10.16b, v11.16b, v8.16b\n\t" + "eor v11.16b, v11.16b, v9.16b\n\t" + "eor v9.16b, v9.16b, v4.16b\n\t" + "shl v4.4s, v9.4s, #8\n\t" + "rev32 v10.8h, v10.8h\n\t" + "sri v4.4s, v9.4s, #24\n\t" + "eor v4.16b, v4.16b, v11.16b\n\t" + "shl v9.4s, v8.4s, #24\n\t" + "eor v4.16b, v4.16b, v10.16b\n\t" + "sri v9.4s, v8.4s, #8\n\t" 
+ "eor v4.16b, v4.16b, v9.16b\n\t" + "ld1 {v0.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v4.16b, v4.16b, v0.16b\n\t" + "eor v8.16b, v4.16b, v12.16b\n\t" + "eor v9.16b, v4.16b, v13.16b\n\t" + "eor v10.16b, v4.16b, v14.16b\n\t" + "tbl v0.16b, {v16.16b, v17.16b, v18.16b, v19.16b}, v4.16b\n\t" + "tbl v8.16b, {v20.16b, v21.16b, v22.16b, v23.16b}, v8.16b\n\t" + "tbl v9.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v9.16b\n\t" + "tbl v10.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v10.16b\n\t" + "orr v0.16b, v0.16b, v8.16b\n\t" + "orr v9.16b, v9.16b, v10.16b\n\t" + "orr v0.16b, v0.16b, v9.16b\n\t" + "tbl v0.16b, {v0.16b}, v3.16b\n\t" + "ld1 {v4.2d}, [x25], #16\n\t" + /* XOR in Key Schedule */ + "eor v0.16b, v0.16b, v4.16b\n\t" + "rev32 v0.16b, v0.16b\n\t" + "eor v0.16b, v0.16b, v2.16b\n\t" + "st1 {v0.16b}, [%x[out]]\n\t" + "\n" + "L_AES_XTS_decrypt_NEON_data_done_%=: \n\t" + "ldp x29, x30, [sp], #32\n\t" + : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), + [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr) + : [in] "r" (in), [i] "r" (i), [te] "r" (te), [td] "r" (td), + [shuffle] "r" (shuffle), [invshuffle] "r" (invshuffle) + : "memory", "cc", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x19", "x24", "x25", "v0", "v1", "v2", "v3", "v4", + "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} + +#endif /* HAVE_AES_DECRYPT */ +#endif /* WOLFSSL_AES_XTS */ +#endif /* !WOLFSSL_ARMASM_NO_NEON */ +#ifndef WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP +#ifdef HAVE_AES_DECRYPT +static const word32 L_AES_ARM64_td[] = { + 0x5051f4a7, 0x537e4165, 0xc31a17a4, 0x963a275e, + 0xcb3bab6b, 0xf11f9d45, 0xabacfa58, 0x934be303, + 0x552030fa, 0xf6ad766d, 0x9188cc76, 0x25f5024c, + 0xfc4fe5d7, 0xd7c52acb, 0x80263544, 0x8fb562a3, + 0x49deb15a, 0x6725ba1b, 0x9845ea0e, 0xe15dfec0, + 0x02c32f75, 0x12814cf0, 0xa38d4697, 
0xc66bd3f9, + 0xe7038f5f, 0x9515929c, 0xebbf6d7a, 0xda955259, + 0x2dd4be83, 0xd3587421, 0x2949e069, 0x448ec9c8, + 0x6a75c289, 0x78f48e79, 0x6b99583e, 0xdd27b971, + 0xb6bee14f, 0x17f088ad, 0x66c920ac, 0xb47dce3a, + 0x1863df4a, 0x82e51a31, 0x60975133, 0x4562537f, + 0xe0b16477, 0x84bb6bae, 0x1cfe81a0, 0x94f9082b, + 0x58704868, 0x198f45fd, 0x8794de6c, 0xb7527bf8, + 0x23ab73d3, 0xe2724b02, 0x57e31f8f, 0x2a6655ab, + 0x07b2eb28, 0x032fb5c2, 0x9a86c57b, 0xa5d33708, + 0xf2302887, 0xb223bfa5, 0xba02036a, 0x5ced1682, + 0x2b8acf1c, 0x92a779b4, 0xf0f307f2, 0xa14e69e2, + 0xcd65daf4, 0xd50605be, 0x1fd13462, 0x8ac4a6fe, + 0x9d342e53, 0xa0a2f355, 0x32058ae1, 0x75a4f6eb, + 0x390b83ec, 0xaa4060ef, 0x065e719f, 0x51bd6e10, + 0xf93e218a, 0x3d96dd06, 0xaedd3e05, 0x464de6bd, + 0xb591548d, 0x0571c45d, 0x6f0406d4, 0xff605015, + 0x241998fb, 0x97d6bde9, 0xcc894043, 0x7767d99e, + 0xbdb0e842, 0x8807898b, 0x38e7195b, 0xdb79c8ee, + 0x47a17c0a, 0xe97c420f, 0xc9f8841e, 0x00000000, + 0x83098086, 0x48322bed, 0xac1e1170, 0x4e6c5a72, + 0xfbfd0eff, 0x560f8538, 0x1e3daed5, 0x27362d39, + 0x640a0fd9, 0x21685ca6, 0xd19b5b54, 0x3a24362e, + 0xb10c0a67, 0x0f9357e7, 0xd2b4ee96, 0x9e1b9b91, + 0x4f80c0c5, 0xa261dc20, 0x695a774b, 0x161c121a, + 0x0ae293ba, 0xe5c0a02a, 0x433c22e0, 0x1d121b17, + 0x0b0e090d, 0xadf28bc7, 0xb92db6a8, 0xc8141ea9, + 0x8557f119, 0x4caf7507, 0xbbee99dd, 0xfda37f60, + 0x9ff70126, 0xbc5c72f5, 0xc544663b, 0x345bfb7e, + 0x768b4329, 0xdccb23c6, 0x68b6edfc, 0x63b8e4f1, + 0xcad731dc, 0x10426385, 0x40139722, 0x2084c611, + 0x7d854a24, 0xf8d2bb3d, 0x11aef932, 0x6dc729a1, + 0x4b1d9e2f, 0xf3dcb230, 0xec0d8652, 0xd077c1e3, + 0x6c2bb316, 0x99a970b9, 0xfa119448, 0x2247e964, + 0xc4a8fc8c, 0x1aa0f03f, 0xd8567d2c, 0xef223390, + 0xc787494e, 0xc1d938d1, 0xfe8ccaa2, 0x3698d40b, + 0xcfa6f581, 0x28a57ade, 0x26dab78e, 0xa43fadbf, + 0xe42c3a9d, 0x0d507892, 0x9b6a5fcc, 0x62547e46, + 0xc2f68d13, 0xe890d8b8, 0x5e2e39f7, 0xf582c3af, + 0xbe9f5d80, 0x7c69d093, 0xa96fd52d, 0xb3cf2512, + 0x3bc8ac99, 0xa710187d, 0x6ee89c63, 
0x7bdb3bbb, + 0x09cd2678, 0xf46e5918, 0x01ec9ab7, 0xa8834f9a, + 0x65e6956e, 0x7eaaffe6, 0x0821bccf, 0xe6ef15e8, + 0xd9bae79b, 0xce4a6f36, 0xd4ea9f09, 0xd629b07c, + 0xaf31a4b2, 0x312a3f23, 0x30c6a594, 0xc035a266, + 0x37744ebc, 0xa6fc82ca, 0xb0e090d0, 0x1533a7d8, + 0x4af10498, 0xf741ecda, 0x0e7fcd50, 0x2f1791f6, + 0x8d764dd6, 0x4d43efb0, 0x54ccaa4d, 0xdfe49604, + 0xe39ed1b5, 0x1b4c6a88, 0xb8c12c1f, 0x7f466551, + 0x049d5eea, 0x5d018c35, 0x73fa8774, 0x2efb0b41, + 0x5ab3671d, 0x5292dbd2, 0x33e91056, 0x136dd647, + 0x8c9ad761, 0x7a37a10c, 0x8e59f814, 0x89eb133c, + 0xeecea927, 0x35b761c9, 0xede11ce5, 0x3c7a47b1, + 0x599cd2df, 0x3f55f273, 0x791814ce, 0xbf73c737, + 0xea53f7cd, 0x5b5ffdaa, 0x14df3d6f, 0x867844db, + 0x81caaff3, 0x3eb968c4, 0x2c382434, 0x5fc2a340, + 0x72161dc3, 0x0cbce225, 0x8b283c49, 0x41ff0d95, + 0x7139a801, 0xde080cb3, 0x9cd8b4e4, 0x906456c1, + 0x617bcb84, 0x70d532b6, 0x74486c5c, 0x42d0b857, +}; + +#endif /* HAVE_AES_DECRYPT */ +#if defined(HAVE_AES_DECRYPT) || defined(HAVE_AES_CBC) || \ + defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) +static const word32 L_AES_ARM64_te[] = { + 0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b, + 0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5, + 0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b, + 0x19e7fefe, 0x62b5d7d7, 0xe64dabab, 0x9aec7676, + 0x458fcaca, 0x9d1f8282, 0x4089c9c9, 0x87fa7d7d, + 0x15effafa, 0xebb25959, 0xc98e4747, 0x0bfbf0f0, + 0xec41adad, 0x67b3d4d4, 0xfd5fa2a2, 0xea45afaf, + 0xbf239c9c, 0xf753a4a4, 0x96e47272, 0x5b9bc0c0, + 0xc275b7b7, 0x1ce1fdfd, 0xae3d9393, 0x6a4c2626, + 0x5a6c3636, 0x417e3f3f, 0x02f5f7f7, 0x4f83cccc, + 0x5c683434, 0xf451a5a5, 0x34d1e5e5, 0x08f9f1f1, + 0x93e27171, 0x73abd8d8, 0x53623131, 0x3f2a1515, + 0x0c080404, 0x5295c7c7, 0x65462323, 0x5e9dc3c3, + 0x28301818, 0xa1379696, 0x0f0a0505, 0xb52f9a9a, + 0x090e0707, 0x36241212, 0x9b1b8080, 0x3ddfe2e2, + 0x26cdebeb, 0x694e2727, 0xcd7fb2b2, 0x9fea7575, + 0x1b120909, 0x9e1d8383, 
0x74582c2c, 0x2e341a1a, + 0x2d361b1b, 0xb2dc6e6e, 0xeeb45a5a, 0xfb5ba0a0, + 0xf6a45252, 0x4d763b3b, 0x61b7d6d6, 0xce7db3b3, + 0x7b522929, 0x3edde3e3, 0x715e2f2f, 0x97138484, + 0xf5a65353, 0x68b9d1d1, 0x00000000, 0x2cc1eded, + 0x60402020, 0x1fe3fcfc, 0xc879b1b1, 0xedb65b5b, + 0xbed46a6a, 0x468dcbcb, 0xd967bebe, 0x4b723939, + 0xde944a4a, 0xd4984c4c, 0xe8b05858, 0x4a85cfcf, + 0x6bbbd0d0, 0x2ac5efef, 0xe54faaaa, 0x16edfbfb, + 0xc5864343, 0xd79a4d4d, 0x55663333, 0x94118585, + 0xcf8a4545, 0x10e9f9f9, 0x06040202, 0x81fe7f7f, + 0xf0a05050, 0x44783c3c, 0xba259f9f, 0xe34ba8a8, + 0xf3a25151, 0xfe5da3a3, 0xc0804040, 0x8a058f8f, + 0xad3f9292, 0xbc219d9d, 0x48703838, 0x04f1f5f5, + 0xdf63bcbc, 0xc177b6b6, 0x75afdada, 0x63422121, + 0x30201010, 0x1ae5ffff, 0x0efdf3f3, 0x6dbfd2d2, + 0x4c81cdcd, 0x14180c0c, 0x35261313, 0x2fc3ecec, + 0xe1be5f5f, 0xa2359797, 0xcc884444, 0x392e1717, + 0x5793c4c4, 0xf255a7a7, 0x82fc7e7e, 0x477a3d3d, + 0xacc86464, 0xe7ba5d5d, 0x2b321919, 0x95e67373, + 0xa0c06060, 0x98198181, 0xd19e4f4f, 0x7fa3dcdc, + 0x66442222, 0x7e542a2a, 0xab3b9090, 0x830b8888, + 0xca8c4646, 0x29c7eeee, 0xd36bb8b8, 0x3c281414, + 0x79a7dede, 0xe2bc5e5e, 0x1d160b0b, 0x76addbdb, + 0x3bdbe0e0, 0x56643232, 0x4e743a3a, 0x1e140a0a, + 0xdb924949, 0x0a0c0606, 0x6c482424, 0xe4b85c5c, + 0x5d9fc2c2, 0x6ebdd3d3, 0xef43acac, 0xa6c46262, + 0xa8399191, 0xa4319595, 0x37d3e4e4, 0x8bf27979, + 0x32d5e7e7, 0x438bc8c8, 0x596e3737, 0xb7da6d6d, + 0x8c018d8d, 0x64b1d5d5, 0xd29c4e4e, 0xe049a9a9, + 0xb4d86c6c, 0xfaac5656, 0x07f3f4f4, 0x25cfeaea, + 0xafca6565, 0x8ef47a7a, 0xe947aeae, 0x18100808, + 0xd56fbaba, 0x88f07878, 0x6f4a2525, 0x725c2e2e, + 0x24381c1c, 0xf157a6a6, 0xc773b4b4, 0x5197c6c6, + 0x23cbe8e8, 0x7ca1dddd, 0x9ce87474, 0x213e1f1f, + 0xdd964b4b, 0xdc61bdbd, 0x860d8b8b, 0x850f8a8a, + 0x90e07070, 0x427c3e3e, 0xc471b5b5, 0xaacc6666, + 0xd8904848, 0x05060303, 0x01f7f6f6, 0x121c0e0e, + 0xa3c26161, 0x5f6a3535, 0xf9ae5757, 0xd069b9b9, + 0x91178686, 0x5899c1c1, 0x273a1d1d, 0xb9279e9e, + 0x38d9e1e1, 0x13ebf8f8, 
0xb32b9898, 0x33221111, + 0xbbd26969, 0x70a9d9d9, 0x89078e8e, 0xa7339494, + 0xb62d9b9b, 0x223c1e1e, 0x92158787, 0x20c9e9e9, + 0x4987cece, 0xffaa5555, 0x78502828, 0x7aa5dfdf, + 0x8f038c8c, 0xf859a1a1, 0x80098989, 0x171a0d0d, + 0xda65bfbf, 0x31d7e6e6, 0xc6844242, 0xb8d06868, + 0xc3824141, 0xb0299999, 0x775a2d2d, 0x111e0f0f, + 0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616, +}; + +#endif /* HAVE_AES_DECRYPT || HAVE_AES_CBC || HAVE_AESCCM || HAVE_AESGCM || + * WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +void AES_invert_key(unsigned char* ks, word32 rounds); +/* Rewrites the key schedule at ks in place. Step 1 reverses the order of its 16-byte entries by swapping from both ends; the swap counter starts at rounds and drops by 2 per swap, and the loop only terminates when it hits exactly zero. Step 2 replaces every 32-bit word of entries 1 .. rounds-1 with td[s2] ^ ror(td[s0], 16) ^ ror(td[s1], 8) ^ ror(td[s3], 24), where sN is a single byte read from the te entry indexed by byte N of the word. Presumably this converts an encryption schedule into the matching decryption schedule - confirm against the caller. */ void AES_invert_key(unsigned char* ks, word32 rounds) +{ + const word32* te = L_AES_ARM64_te; + const word32* td = L_AES_ARM64_td; + __asm__ __volatile__ ( + /* x12 = ks plus rounds * 16. NOTE(review): rounds is declared word32 but is read through its 64-bit register name here - confirm the upper 32 bits are guaranteed to be zero */ "add x12, %x[ks], %x[rounds], lsl 4\n\t" + /* w13 = swap loop counter */ "mov w13, %w[rounds]\n\t" + "\n" + /* Step 1: exchange the 16 bytes at ks with the 16 bytes at x12 */ "L_AES_invert_key_loop_%=: \n\t" + "ldp w4, w5, [%x[ks]]\n\t" + "ldnp w6, w7, [%x[ks], #8]\n\t" + "ldp w8, w9, [x12]\n\t" + "ldnp w10, w11, [x12, #8]\n\t" + "stp w4, w5, [x12]\n\t" + "stnp w6, w7, [x12, #8]\n\t" + /* The two post-indexed stores advance ks by 16 in total */ "stp w8, w9, [%x[ks]], #8\n\t" + "stp w10, w11, [%x[ks]], #8\n\t" + /* Sets the flags tested by b.ne below; the sub in between does not touch them */ "subs w13, w13, #2\n\t" + "sub x12, x12, #16\n\t" + "b.ne L_AES_invert_key_loop_%=\n\t" + /* ks moved forward 16 bytes per swap, rounds * 8 in total: rewind to the start, then skip the first 16-byte entry */ "sub %x[ks], %x[ks], %x[rounds], lsl 3\n\t" + "add %x[ks], %x[ks], #16\n\t" + /* Step 2 runs for rounds - 1 entries */ "sub w13, %w[rounds], #1\n\t" + "\n" + /* Step 2: load the four words w4-w7 of the current entry */ "L_AES_invert_key_mix_loop_%=: \n\t" + "ldp w4, w5, [%x[ks]]\n\t" + "ldnp w6, w7, [%x[ks], #8]\n\t" + /* Word 0 (w4): split into its four bytes */ "ubfx w8, w4, #0, #8\n\t" + "ubfx w9, w4, #8, #8\n\t" + "ubfx w10, w4, #16, #8\n\t" + "ubfx w11, w4, #24, #8\n\t" + /* Byte value * 4 = byte offset of the 32-bit te entry */ "lsl w8, w8, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w11, w11, #2\n\t" + /* Read only the lowest-addressed byte of that te entry (the low 8 bits on a little-endian target - TODO confirm intent) */ "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w11, [%[te], x11, LSL 0]\n\t" + /* Use each byte as an index into td */ "ldr w8, [%[td], x8, LSL 2]\n\t" + "ldr w9, [%[td], x9, LSL 2]\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + /* Fold the four td words together with rotations of 16, 8 and 24 bits */ "eor w10, w10, w8, ror 16\n\t" + "eor w10, w10, w9, ror 8\n\t" + "eor w10, w10,
w11, ror 24\n\t" + /* Write word 0 back and advance ks by 4 */ "str w10, [%x[ks]], #4\n\t" + /* Word 1 (w5): same sequence as word 0 */ "ubfx w8, w5, #0, #8\n\t" + "ubfx w9, w5, #8, #8\n\t" + "ubfx w10, w5, #16, #8\n\t" + "ubfx w11, w5, #24, #8\n\t" + "lsl w8, w8, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w11, w11, #2\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w11, [%[te], x11, LSL 0]\n\t" + "ldr w8, [%[td], x8, LSL 2]\n\t" + "ldr w9, [%[td], x9, LSL 2]\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "eor w10, w10, w8, ror 16\n\t" + "eor w10, w10, w9, ror 8\n\t" + "eor w10, w10, w11, ror 24\n\t" + "str w10, [%x[ks]], #4\n\t" + /* Word 2 (w6): same sequence as word 0 */ "ubfx w8, w6, #0, #8\n\t" + "ubfx w9, w6, #8, #8\n\t" + "ubfx w10, w6, #16, #8\n\t" + "ubfx w11, w6, #24, #8\n\t" + "lsl w8, w8, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w11, w11, #2\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w11, [%[te], x11, LSL 0]\n\t" + "ldr w8, [%[td], x8, LSL 2]\n\t" + "ldr w9, [%[td], x9, LSL 2]\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "eor w10, w10, w8, ror 16\n\t" + "eor w10, w10, w9, ror 8\n\t" + "eor w10, w10, w11, ror 24\n\t" + "str w10, [%x[ks]], #4\n\t" + /* Word 3 (w7): same sequence as word 0 */ "ubfx w8, w7, #0, #8\n\t" + "ubfx w9, w7, #8, #8\n\t" + "ubfx w10, w7, #16, #8\n\t" + "ubfx w11, w7, #24, #8\n\t" + "lsl w8, w8, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w11, w11, #2\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w11, [%[te], x11, LSL 0]\n\t" + "ldr w8, [%[td], x8, LSL 2]\n\t" + "ldr w9, [%[td], x9, LSL 2]\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "eor w10, w10, w8, ror 16\n\t" + "eor w10, w10, w9, ror 8\n\t" + "eor w10, w10, w11, ror 24\n\t" + "str w10, [%x[ks]], #4\n\t" + /* ks now points at the next 16-byte entry; repeat until the counter reaches zero */ "subs w13, w13, #1\n\t" + "b.ne
L_AES_invert_key_mix_loop_%=\n\t" + : [ks] "+r" (ks), [rounds] "+r" (rounds) + : [te] "r" (te), [td] "r" (td) + : "memory", "cc", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", + "x12", "x13" + ); +} + +#endif /* HAVE_AES_DECRYPT */ +static const word32 L_AES_ARM64_rcon[] = { + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1b000000, 0x36000000, +}; + +void AES_set_encrypt_key(const unsigned char* key, word32 len, + unsigned char* ks); +void AES_set_encrypt_key(const unsigned char* key, word32 len, + unsigned char* ks) +{ + const word32* rcon = L_AES_ARM64_rcon; + const word32* te = L_AES_ARM64_te; + __asm__ __volatile__ ( + "cmp %w[len], #0x80\n\t" + "b.eq L_AES_set_encrypt_key_start_128_%=\n\t" + "cmp %w[len], #0xc0\n\t" + "b.eq L_AES_set_encrypt_key_start_192_%=\n\t" + "ldr w6, [%x[key]]\n\t" + "ldr w7, [%x[key], #4]\n\t" + "ldr w8, [%x[key], #8]\n\t" + "ldr w9, [%x[key], #12]\n\t" + "rev w6, w6\n\t" + "rev w7, w7\n\t" + "rev w8, w8\n\t" + "rev w9, w9\n\t" + "stp w6, w7, [%x[ks]], #8\n\t" + "stp w8, w9, [%x[ks]], #8\n\t" + "ldr w6, [%x[key], #16]\n\t" + "ldr w7, [%x[key], #20]\n\t" + "ldr w8, [%x[key], #24]\n\t" + "ldr w9, [%x[key], #28]\n\t" + "rev w6, w6\n\t" + "rev w7, w7\n\t" + "rev w8, w8\n\t" + "rev w9, w9\n\t" + "stp w6, w7, [%x[ks]]\n\t" + "stnp w8, w9, [%x[ks], #8]\n\t" + "sub %x[ks], %x[ks], #16\n\t" + "mov x4, #6\n\t" + "\n" + "L_AES_set_encrypt_key_loop_256_%=: \n\t" + "ubfx w6, w9, #0, #8\n\t" + "ubfx w7, w9, #8, #8\n\t" + "ubfx w8, w9, #16, #8\n\t" + "ubfx w9, w9, #24, #8\n\t" + "lsl w6, w6, #2\n\t" + "lsl w7, w7, #2\n\t" + "lsl w8, w8, #2\n\t" + "lsl w9, w9, #2\n\t" + "ldrb w6, [%[te], x6, LSL 0]\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "eor w3, w9, w6, lsl 8\n\t" + "eor w3, w3, w7, lsl 16\n\t" + "eor w3, w3, w8, lsl 24\n\t" + "ldp w6, w7, [%x[ks]], #8\n\t" + "ldp w8, w9, [%x[ks]], #8\n\t" + "eor w6, w6, w3\n\t" +
"ldr w3, [%[rcon]], #4\n\t" + "eor w6, w6, w3\n\t" + "eor w7, w7, w6\n\t" + "eor w8, w8, w7\n\t" + "eor w9, w9, w8\n\t" + "add %x[ks], %x[ks], #16\n\t" + "stp w6, w7, [%x[ks]]\n\t" + "stnp w8, w9, [%x[ks], #8]\n\t" + "sub %x[ks], %x[ks], #16\n\t" + "mov w3, w9\n\t" + "ubfx w6, w3, #8, #8\n\t" + "ubfx w7, w3, #16, #8\n\t" + "ubfx w8, w3, #24, #8\n\t" + "ubfx w3, w3, #0, #8\n\t" + "lsl w6, w6, #2\n\t" + "lsl w7, w7, #2\n\t" + "lsl w8, w8, #2\n\t" + "lsl w3, w3, #2\n\t" + "ldrb w6, [%[te], x6, LSL 0]\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + "ldrb w3, [%[te], x3, LSL 0]\n\t" + "eor w3, w3, w6, lsl 8\n\t" + "eor w3, w3, w7, lsl 16\n\t" + "eor w3, w3, w8, lsl 24\n\t" + "ldp w6, w7, [%x[ks]], #8\n\t" + "ldp w8, w9, [%x[ks]], #8\n\t" + "eor w6, w6, w3\n\t" + "eor w7, w7, w6\n\t" + "eor w8, w8, w7\n\t" + "eor w9, w9, w8\n\t" + "add %x[ks], %x[ks], #16\n\t" + "stp w6, w7, [%x[ks]]\n\t" + "stnp w8, w9, [%x[ks], #8]\n\t" + "sub %x[ks], %x[ks], #16\n\t" + "subs x4, x4, #1\n\t" + "b.ne L_AES_set_encrypt_key_loop_256_%=\n\t" + "ubfx w6, w9, #0, #8\n\t" + "ubfx w7, w9, #8, #8\n\t" + "ubfx w8, w9, #16, #8\n\t" + "ubfx w9, w9, #24, #8\n\t" + "lsl w6, w6, #2\n\t" + "lsl w7, w7, #2\n\t" + "lsl w8, w8, #2\n\t" + "lsl w9, w9, #2\n\t" + "ldrb w6, [%[te], x6, LSL 0]\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "eor w3, w9, w6, lsl 8\n\t" + "eor w3, w3, w7, lsl 16\n\t" + "eor w3, w3, w8, lsl 24\n\t" + "ldp w6, w7, [%x[ks]], #8\n\t" + "ldp w8, w9, [%x[ks]], #8\n\t" + "eor w6, w6, w3\n\t" + "ldr w3, [%[rcon]], #4\n\t" + "eor w6, w6, w3\n\t" + "eor w7, w7, w6\n\t" + "eor w8, w8, w7\n\t" + "eor w9, w9, w8\n\t" + "add %x[ks], %x[ks], #16\n\t" + "stp w6, w7, [%x[ks]]\n\t" + "stnp w8, w9, [%x[ks], #8]\n\t" + "sub %x[ks], %x[ks], #16\n\t" + "b L_AES_set_encrypt_key_end_%=\n\t" + "\n" + "L_AES_set_encrypt_key_start_192_%=: \n\t" + "ldr w6, [%x[key]]\n\t" + "ldr w7, [%x[key], #4]\n\t" + "ldr 
w8, [%x[key], #8]\n\t" + "ldr w9, [%x[key], #12]\n\t" + "ldr w10, [%x[key], #16]\n\t" + "ldr w11, [%x[key], #20]\n\t" + "rev w6, w6\n\t" + "rev w7, w7\n\t" + "rev w8, w8\n\t" + "rev w9, w9\n\t" + "rev w10, w10\n\t" + "rev w11, w11\n\t" + "stp w6, w7, [%x[ks]]\n\t" + "stnp w8, w9, [%x[ks], #8]\n\t" + "stnp w10, w11, [%x[ks], #16]\n\t" + "mov x4, #7\n\t" + "\n" + "L_AES_set_encrypt_key_loop_192_%=: \n\t" + "ubfx w6, w11, #0, #8\n\t" + "ubfx w7, w11, #8, #8\n\t" + "ubfx w8, w11, #16, #8\n\t" + "ubfx w11, w11, #24, #8\n\t" + "lsl w6, w6, #2\n\t" + "lsl w7, w7, #2\n\t" + "lsl w8, w8, #2\n\t" + "lsl w11, w11, #2\n\t" + "ldrb w6, [%[te], x6, LSL 0]\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w11, [%[te], x11, LSL 0]\n\t" + "eor w3, w11, w6, lsl 8\n\t" + "eor w3, w3, w7, lsl 16\n\t" + "eor w3, w3, w8, lsl 24\n\t" + "ldp w6, w7, [%x[ks]], #8\n\t" + "ldp w8, w9, [%x[ks]], #8\n\t" + "ldp w10, w11, [%x[ks]], #8\n\t" + "eor w6, w6, w3\n\t" + "ldr w3, [%[rcon]], #4\n\t" + "eor w6, w6, w3\n\t" + "eor w7, w7, w6\n\t" + "eor w8, w8, w7\n\t" + "eor w9, w9, w8\n\t" + "eor w10, w10, w9\n\t" + "eor w11, w11, w10\n\t" + "stp w6, w7, [%x[ks]]\n\t" + "stnp w8, w9, [%x[ks], #8]\n\t" + "stnp w10, w11, [%x[ks], #16]\n\t" + "subs x4, x4, #1\n\t" + "b.ne L_AES_set_encrypt_key_loop_192_%=\n\t" + "ubfx w6, w11, #0, #8\n\t" + "ubfx w7, w11, #8, #8\n\t" + "ubfx w8, w11, #16, #8\n\t" + "ubfx w11, w11, #24, #8\n\t" + "lsl w6, w6, #2\n\t" + "lsl w7, w7, #2\n\t" + "lsl w8, w8, #2\n\t" + "lsl w11, w11, #2\n\t" + "ldrb w6, [%[te], x6, LSL 0]\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w11, [%[te], x11, LSL 0]\n\t" + "eor w3, w11, w6, lsl 8\n\t" + "eor w3, w3, w7, lsl 16\n\t" + "eor w3, w3, w8, lsl 24\n\t" + "ldp w6, w7, [%x[ks]], #8\n\t" + "ldp w8, w9, [%x[ks]], #8\n\t" + "ldp w10, w11, [%x[ks]], #8\n\t" + "eor w6, w6, w3\n\t" + "ldr w3, [%[rcon]], #4\n\t" + "eor w6, w6, w3\n\t" + "eor w7, w7, w6\n\t" + "eor w8, w8, 
w7\n\t" + "eor w9, w9, w8\n\t" + "stp w6, w7, [%x[ks]]\n\t" + "stnp w8, w9, [%x[ks], #8]\n\t" + "b L_AES_set_encrypt_key_end_%=\n\t" + "\n" + "L_AES_set_encrypt_key_start_128_%=: \n\t" + "ldr w6, [%x[key]]\n\t" + "ldr w7, [%x[key], #4]\n\t" + "ldr w8, [%x[key], #8]\n\t" + "ldr w9, [%x[key], #12]\n\t" + "rev w6, w6\n\t" + "rev w7, w7\n\t" + "rev w8, w8\n\t" + "rev w9, w9\n\t" + "stp w6, w7, [%x[ks]]\n\t" + "stnp w8, w9, [%x[ks], #8]\n\t" + "mov x4, #10\n\t" + "\n" + "L_AES_set_encrypt_key_loop_128_%=: \n\t" + "ubfx w6, w9, #0, #8\n\t" + "ubfx w7, w9, #8, #8\n\t" + "ubfx w8, w9, #16, #8\n\t" + "ubfx w9, w9, #24, #8\n\t" + "lsl w6, w6, #2\n\t" + "lsl w7, w7, #2\n\t" + "lsl w8, w8, #2\n\t" + "lsl w9, w9, #2\n\t" + "ldrb w6, [%[te], x6, LSL 0]\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "eor w3, w9, w6, lsl 8\n\t" + "eor w3, w3, w7, lsl 16\n\t" + "eor w3, w3, w8, lsl 24\n\t" + "ldp w6, w7, [%x[ks]], #8\n\t" + "ldp w8, w9, [%x[ks]], #8\n\t" + "eor w6, w6, w3\n\t" + "ldr w3, [%[rcon]], #4\n\t" + "eor w6, w6, w3\n\t" + "eor w7, w7, w6\n\t" + "eor w8, w8, w7\n\t" + "eor w9, w9, w8\n\t" + "stp w6, w7, [%x[ks]]\n\t" + "stnp w8, w9, [%x[ks], #8]\n\t" + "subs x4, x4, #1\n\t" + "b.ne L_AES_set_encrypt_key_loop_128_%=\n\t" + "\n" + "L_AES_set_encrypt_key_end_%=: \n\t" + : [len] "+r" (len), [ks] "+r" (ks) + : [key] "r" (key), [rcon] "r" (rcon), [te] "r" (te) + : "memory", "cc", "x3", "x4", "x6", "x7", "x8", "x9", "x10", "x11" + ); +} + +#if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_ECB) +void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr); +void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr) +{ + const word32* te = L_AES_ARM64_te; + __asm__ __volatile__ ( + "\n" + 
"L_AES_ECB_encrypt_loop_block_128_%=: \n\t" + "mov x17, %x[ks]\n\t" + "ldr x6, [%x[in]]\n\t" + "ldr x7, [%x[in], #8]\n\t" + "rev32 x6, x6\n\t" + "rev32 x7, x7\n\t" + "ldp x10, x11, [x17], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x6, x6, x10\n\t" + "eor x7, x7, x11\n\t" + "sub w16, %w[nr], #2\n\t" + "\n" + "L_AES_ECB_encrypt_loop_nr_%=: \n\t" + "ubfx x10, x6, #48, #8\n\t" + "ubfx x13, x6, #24, #8\n\t" + "ubfx x14, x7, #8, #8\n\t" + "ubfx x15, x7, #32, #8\n\t" + "ldr x8, [%[te]]\n\t" + "ldr x8, [%[te], #64]\n\t" + "ldr x8, [%[te], #128]\n\t" + "ldr x8, [%[te], #192]\n\t" + "ldr x8, [%[te], #256]\n\t" + "ldr x8, [%[te], #320]\n\t" + "ldr x8, [%[te], #384]\n\t" + "ldr x8, [%[te], #448]\n\t" + "ldr x8, [%[te], #512]\n\t" + "ldr x8, [%[te], #576]\n\t" + "ldr x8, [%[te], #640]\n\t" + "ldr x8, [%[te], #704]\n\t" + "ldr x8, [%[te], #768]\n\t" + "ldr x8, [%[te], #832]\n\t" + "ldr x8, [%[te], #896]\n\t" + "ldr x8, [%[te], #960]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x11, x7, #16, #8\n\t" + "eor w10, w10, w13, ror 24\n\t" + "ubfx x13, x6, #56, #8\n\t" + "eor w10, w10, w14, ror 8\n\t" + "ubfx x14, x7, #40, #8\n\t" + "eor w10, w10, w15, ror 16\n\t" + "ubfx x15, x6, #0, #8\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x12, x7, #48, #8\n\t" + "eor w11, w11, w13, ror 24\n\t" + "ubfx x13, x7, #24, #8\n\t" + "eor w11, w11, w14, ror 8\n\t" + "ubfx x14, x6, #8, #8\n\t" + "eor w11, w11, w15, ror 16\n\t" + "ubfx x15, x6, #32, #8\n\t" + "bfi x10, x11, #32, #32\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x8, x7, #0, #8\n\t" + "eor w12, w12, w13, ror 24\n\t" + "ubfx x13, x6, #16, #8\n\t" + "eor w12, w12, w14, ror 8\n\t" + "ubfx 
x14, x7, #56, #8\n\t" + "eor w11, w12, w15, ror 16\n\t" + "ubfx x15, x6, #40, #8\n\t" + "ldr w8, [%[te], x8, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "eor w14, w14, w8, ror 24\n\t" + "ldp x6, x7, [x17], #16\n\t" + "eor w13, w13, w14, ror 24\n\t" + "eor w13, w13, w15, ror 8\n\t" + "bfi x11, x13, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x10, x10, x6\n\t" + "eor x11, x11, x7\n\t" + "ubfx x6, x10, #48, #8\n\t" + "ubfx x9, x10, #24, #8\n\t" + "ubfx x14, x11, #8, #8\n\t" + "ubfx x15, x11, #32, #8\n\t" + "ldr x12, [%[te]]\n\t" + "ldr x12, [%[te], #64]\n\t" + "ldr x12, [%[te], #128]\n\t" + "ldr x12, [%[te], #192]\n\t" + "ldr x12, [%[te], #256]\n\t" + "ldr x12, [%[te], #320]\n\t" + "ldr x12, [%[te], #384]\n\t" + "ldr x12, [%[te], #448]\n\t" + "ldr x12, [%[te], #512]\n\t" + "ldr x12, [%[te], #576]\n\t" + "ldr x12, [%[te], #640]\n\t" + "ldr x12, [%[te], #704]\n\t" + "ldr x12, [%[te], #768]\n\t" + "ldr x12, [%[te], #832]\n\t" + "ldr x12, [%[te], #896]\n\t" + "ldr x12, [%[te], #960]\n\t" + "ldr w6, [%[te], x6, LSL 2]\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x7, x11, #16, #8\n\t" + "eor w6, w6, w9, ror 24\n\t" + "ubfx x9, x10, #56, #8\n\t" + "eor w6, w6, w14, ror 8\n\t" + "ubfx x14, x11, #40, #8\n\t" + "eor w6, w6, w15, ror 16\n\t" + "ubfx x15, x10, #0, #8\n\t" + "ldr w7, [%[te], x7, LSL 2]\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x8, x11, #48, #8\n\t" + "eor w7, w7, w9, ror 24\n\t" + "ubfx x9, x11, #24, #8\n\t" + "eor w7, w7, w14, ror 8\n\t" + "ubfx x14, x10, #8, #8\n\t" + "eor w7, w7, w15, ror 16\n\t" + "ubfx x15, x10, #32, #8\n\t" + "bfi x6, x7, #32, #32\n\t" + "ldr w8, [%[te], x8, LSL 2]\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x12, x11, #0, 
#8\n\t" + "eor w8, w8, w9, ror 24\n\t" + "ubfx x9, x10, #16, #8\n\t" + "eor w8, w8, w14, ror 8\n\t" + "ubfx x14, x11, #56, #8\n\t" + "eor w7, w8, w15, ror 16\n\t" + "ubfx x15, x10, #40, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "eor w14, w14, w12, ror 24\n\t" + "ldp x10, x11, [x17], #16\n\t" + "eor w9, w9, w14, ror 24\n\t" + "eor w9, w9, w15, ror 8\n\t" + "bfi x7, x9, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x6, x6, x10\n\t" + "eor x7, x7, x11\n\t" + "subs w16, w16, #2\n\t" + "b.ne L_AES_ECB_encrypt_loop_nr_%=\n\t" + "ubfx x10, x6, #48, #8\n\t" + "ubfx x13, x6, #24, #8\n\t" + "ubfx x14, x7, #8, #8\n\t" + "ubfx x15, x7, #32, #8\n\t" + "ldr x8, [%[te]]\n\t" + "ldr x8, [%[te], #64]\n\t" + "ldr x8, [%[te], #128]\n\t" + "ldr x8, [%[te], #192]\n\t" + "ldr x8, [%[te], #256]\n\t" + "ldr x8, [%[te], #320]\n\t" + "ldr x8, [%[te], #384]\n\t" + "ldr x8, [%[te], #448]\n\t" + "ldr x8, [%[te], #512]\n\t" + "ldr x8, [%[te], #576]\n\t" + "ldr x8, [%[te], #640]\n\t" + "ldr x8, [%[te], #704]\n\t" + "ldr x8, [%[te], #768]\n\t" + "ldr x8, [%[te], #832]\n\t" + "ldr x8, [%[te], #896]\n\t" + "ldr x8, [%[te], #960]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x11, x7, #16, #8\n\t" + "eor w10, w10, w13, ror 24\n\t" + "ubfx x13, x6, #56, #8\n\t" + "eor w10, w10, w14, ror 8\n\t" + "ubfx x14, x7, #40, #8\n\t" + "eor w10, w10, w15, ror 16\n\t" + "ubfx x15, x6, #0, #8\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x12, x7, #48, #8\n\t" + "eor w11, w11, w13, ror 24\n\t" + "ubfx x13, x7, #24, #8\n\t" + "eor w11, w11, w14, ror 8\n\t" + "ubfx x14, x6, #8, #8\n\t" + "eor w11, w11, w15, ror 16\n\t" + "ubfx x15, x6, #32, #8\n\t" + "bfi x10, x11, #32, 
#32\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x8, x7, #0, #8\n\t" + "eor w12, w12, w13, ror 24\n\t" + "ubfx x13, x6, #16, #8\n\t" + "eor w12, w12, w14, ror 8\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w12, w15, ror 16\n\t" + "ubfx x15, x6, #40, #8\n\t" + "ldr w8, [%[te], x8, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "eor w14, w14, w8, ror 24\n\t" + "ldp x6, x7, [x17], #16\n\t" + "eor w13, w13, w14, ror 24\n\t" + "eor w13, w13, w15, ror 8\n\t" + "bfi x11, x13, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x10, x10, x6\n\t" + "eor x11, x11, x7\n\t" + "ubfx x6, x11, #32, #8\n\t" + "ubfx x9, x11, #8, #8\n\t" + "ubfx x14, x10, #48, #8\n\t" + "ubfx x15, x10, #24, #8\n\t" + "lsl w6, w6, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w14, w14, #2\n\t" + "lsl w15, w15, #2\n\t" + "ldr x13, [%[te]]\n\t" + "ldr x13, [%[te], #64]\n\t" + "ldr x13, [%[te], #128]\n\t" + "ldr x13, [%[te], #192]\n\t" + "ldr x13, [%[te], #256]\n\t" + "ldr x13, [%[te], #320]\n\t" + "ldr x13, [%[te], #384]\n\t" + "ldr x13, [%[te], #448]\n\t" + "ldr x13, [%[te], #512]\n\t" + "ldr x13, [%[te], #576]\n\t" + "ldr x13, [%[te], #640]\n\t" + "ldr x13, [%[te], #704]\n\t" + "ldr x13, [%[te], #768]\n\t" + "ldr x13, [%[te], #832]\n\t" + "ldr x13, [%[te], #896]\n\t" + "ldr x13, [%[te], #960]\n\t" + "ldrb w6, [%[te], x6, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "ubfx x7, x10, #0, #8\n\t" + "eor w6, w6, w9, lsl 8\n\t" + "ubfx x9, x11, #40, #8\n\t" + "eor w6, w6, w14, lsl 16\n\t" + "ubfx x14, x11, #16, #8\n\t" + "eor w6, w6, w15, lsl 24\n\t" + "ubfx x15, x10, #56, #8\n\t" + "lsl w7, w7, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w14, w14, #2\n\t" + "lsl w15, w15, #2\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" 
+ "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "ubfx x8, x10, #32, #8\n\t" + "eor w7, w7, w9, lsl 8\n\t" + "ubfx x9, x10, #8, #8\n\t" + "eor w7, w7, w14, lsl 16\n\t" + "ubfx x14, x11, #48, #8\n\t" + "eor w7, w7, w15, lsl 24\n\t" + "ubfx x15, x11, #24, #8\n\t" + "bfi x6, x7, #32, #32\n\t" + "lsl w8, w8, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w14, w14, #2\n\t" + "lsl w15, w15, #2\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "ubfx x13, x11, #56, #8\n\t" + "eor w8, w8, w9, lsl 8\n\t" + "ubfx x9, x11, #0, #8\n\t" + "eor w8, w8, w14, lsl 16\n\t" + "ubfx x14, x10, #40, #8\n\t" + "eor w7, w8, w15, lsl 24\n\t" + "ubfx x15, x10, #16, #8\n\t" + "lsl w13, w13, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w14, w14, #2\n\t" + "lsl w15, w15, #2\n\t" + "ldrb w13, [%[te], x13, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "eor w14, w14, w13, lsl 16\n\t" + "ldp x10, x11, [x17]\n\t" + "eor w9, w9, w14, lsl 8\n\t" + "eor w9, w9, w15, lsl 16\n\t" + "bfi x7, x9, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x6, x6, x10\n\t" + "eor x7, x7, x11\n\t" + "rev32 x6, x6\n\t" + "rev32 x7, x7\n\t" + "str x6, [%x[out]]\n\t" + "str x7, [%x[out], #8]\n\t" + "subs %x[len], %x[len], #16\n\t" + "add %x[in], %x[in], #16\n\t" + "add %x[out], %x[out], #16\n\t" + "b.ne L_AES_ECB_encrypt_loop_block_128_%=\n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr) + : [in] "r" (in), [ks] "r" (ks), [te] "r" (te) + : "memory", "cc", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", + "x14", "x15", "x16", "x17" + ); +} + +#endif /* HAVE_AESCCM || HAVE_AESGCM || WOLFSSL_AES_DIRECT || + * WOLFSSL_AES_COUNTER || HAVE_AES_ECB */ +#ifdef HAVE_AES_CBC +void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); +void 
AES_CBC_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv) +{ + const word32* te = L_AES_ARM64_te; + __asm__ __volatile__ ( + "ldp x7, x8, [%x[iv]]\n\t" + "\n" + "L_AES_CBC_encrypt_loop_block_%=: \n\t" + "mov x19, %x[ks]\n\t" + "ldr x11, [%x[in]]\n\t" + "ldr x12, [%x[in], #8]\n\t" + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "rev32 x7, x7\n\t" + "rev32 x8, x8\n\t" + "ldp x11, x12, [x19], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "sub w17, %w[nr], #2\n\t" + "\n" + "L_AES_CBC_encrypt_loop_nr_%=: \n\t" + "ubfx x11, x7, #48, #8\n\t" + "ubfx x14, x7, #24, #8\n\t" + "ubfx x15, x8, #8, #8\n\t" + "ubfx x16, x8, #32, #8\n\t" + "ldr x9, [%[te]]\n\t" + "ldr x9, [%[te], #64]\n\t" + "ldr x9, [%[te], #128]\n\t" + "ldr x9, [%[te], #192]\n\t" + "ldr x9, [%[te], #256]\n\t" + "ldr x9, [%[te], #320]\n\t" + "ldr x9, [%[te], #384]\n\t" + "ldr x9, [%[te], #448]\n\t" + "ldr x9, [%[te], #512]\n\t" + "ldr x9, [%[te], #576]\n\t" + "ldr x9, [%[te], #640]\n\t" + "ldr x9, [%[te], #704]\n\t" + "ldr x9, [%[te], #768]\n\t" + "ldr x9, [%[te], #832]\n\t" + "ldr x9, [%[te], #896]\n\t" + "ldr x9, [%[te], #960]\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ubfx x12, x8, #16, #8\n\t" + "eor w11, w11, w14, ror 24\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w11, w15, ror 8\n\t" + "ubfx x15, x8, #40, #8\n\t" + "eor w11, w11, w16, ror 16\n\t" + "ubfx x16, x7, #0, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ubfx x13, x8, #48, #8\n\t" + "eor w12, w12, w14, ror 24\n\t" + "ubfx x14, x8, #24, #8\n\t" + "eor w12, w12, w15, ror 8\n\t" + "ubfx x15, x7, #8, #8\n\t" + "eor w12, w12, w16, ror 16\n\t" + "ubfx x16, x7, #32, #8\n\t" + "bfi x11, x12, #32, #32\n\t" 
+ "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ubfx x9, x8, #0, #8\n\t" + "eor w13, w13, w14, ror 24\n\t" + "ubfx x14, x7, #16, #8\n\t" + "eor w13, w13, w15, ror 8\n\t" + "ubfx x15, x8, #56, #8\n\t" + "eor w12, w13, w16, ror 16\n\t" + "ubfx x16, x7, #40, #8\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "eor w15, w15, w9, ror 24\n\t" + "ldp x7, x8, [x19], #16\n\t" + "eor w14, w14, w15, ror 24\n\t" + "eor w14, w14, w16, ror 8\n\t" + "bfi x12, x14, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x11, x11, x7\n\t" + "eor x12, x12, x8\n\t" + "ubfx x7, x11, #48, #8\n\t" + "ubfx x10, x11, #24, #8\n\t" + "ubfx x15, x12, #8, #8\n\t" + "ubfx x16, x12, #32, #8\n\t" + "ldr x13, [%[te]]\n\t" + "ldr x13, [%[te], #64]\n\t" + "ldr x13, [%[te], #128]\n\t" + "ldr x13, [%[te], #192]\n\t" + "ldr x13, [%[te], #256]\n\t" + "ldr x13, [%[te], #320]\n\t" + "ldr x13, [%[te], #384]\n\t" + "ldr x13, [%[te], #448]\n\t" + "ldr x13, [%[te], #512]\n\t" + "ldr x13, [%[te], #576]\n\t" + "ldr x13, [%[te], #640]\n\t" + "ldr x13, [%[te], #704]\n\t" + "ldr x13, [%[te], #768]\n\t" + "ldr x13, [%[te], #832]\n\t" + "ldr x13, [%[te], #896]\n\t" + "ldr x13, [%[te], #960]\n\t" + "ldr w7, [%[te], x7, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ubfx x8, x12, #16, #8\n\t" + "eor w7, w7, w10, ror 24\n\t" + "ubfx x10, x11, #56, #8\n\t" + "eor w7, w7, w15, ror 8\n\t" + "ubfx x15, x12, #40, #8\n\t" + "eor w7, w7, w16, ror 16\n\t" + "ubfx x16, x11, #0, #8\n\t" + "ldr w8, [%[te], x8, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ubfx x9, x12, #48, #8\n\t" + "eor w8, w8, w10, ror 24\n\t" + "ubfx x10, x12, #24, #8\n\t" + "eor w8, w8, w15, ror 8\n\t" + 
"ubfx x15, x11, #8, #8\n\t" + "eor w8, w8, w16, ror 16\n\t" + "ubfx x16, x11, #32, #8\n\t" + "bfi x7, x8, #32, #32\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ubfx x13, x12, #0, #8\n\t" + "eor w9, w9, w10, ror 24\n\t" + "ubfx x10, x11, #16, #8\n\t" + "eor w9, w9, w15, ror 8\n\t" + "ubfx x15, x12, #56, #8\n\t" + "eor w8, w9, w16, ror 16\n\t" + "ubfx x16, x11, #40, #8\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "eor w15, w15, w13, ror 24\n\t" + "ldp x11, x12, [x19], #16\n\t" + "eor w10, w10, w15, ror 24\n\t" + "eor w10, w10, w16, ror 8\n\t" + "bfi x8, x10, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "subs w17, w17, #2\n\t" + "b.ne L_AES_CBC_encrypt_loop_nr_%=\n\t" + "ubfx x11, x7, #48, #8\n\t" + "ubfx x14, x7, #24, #8\n\t" + "ubfx x15, x8, #8, #8\n\t" + "ubfx x16, x8, #32, #8\n\t" + "ldr x9, [%[te]]\n\t" + "ldr x9, [%[te], #64]\n\t" + "ldr x9, [%[te], #128]\n\t" + "ldr x9, [%[te], #192]\n\t" + "ldr x9, [%[te], #256]\n\t" + "ldr x9, [%[te], #320]\n\t" + "ldr x9, [%[te], #384]\n\t" + "ldr x9, [%[te], #448]\n\t" + "ldr x9, [%[te], #512]\n\t" + "ldr x9, [%[te], #576]\n\t" + "ldr x9, [%[te], #640]\n\t" + "ldr x9, [%[te], #704]\n\t" + "ldr x9, [%[te], #768]\n\t" + "ldr x9, [%[te], #832]\n\t" + "ldr x9, [%[te], #896]\n\t" + "ldr x9, [%[te], #960]\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ubfx x12, x8, #16, #8\n\t" + "eor w11, w11, w14, ror 24\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w11, w15, ror 8\n\t" + "ubfx x15, x8, #40, #8\n\t" + "eor w11, w11, w16, ror 16\n\t" + "ubfx x16, x7, #0, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, 
LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ubfx x13, x8, #48, #8\n\t" + "eor w12, w12, w14, ror 24\n\t" + "ubfx x14, x8, #24, #8\n\t" + "eor w12, w12, w15, ror 8\n\t" + "ubfx x15, x7, #8, #8\n\t" + "eor w12, w12, w16, ror 16\n\t" + "ubfx x16, x7, #32, #8\n\t" + "bfi x11, x12, #32, #32\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ubfx x9, x8, #0, #8\n\t" + "eor w13, w13, w14, ror 24\n\t" + "ubfx x14, x7, #16, #8\n\t" + "eor w13, w13, w15, ror 8\n\t" + "ubfx x15, x8, #56, #8\n\t" + "eor w12, w13, w16, ror 16\n\t" + "ubfx x16, x7, #40, #8\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "eor w15, w15, w9, ror 24\n\t" + "ldp x7, x8, [x19], #16\n\t" + "eor w14, w14, w15, ror 24\n\t" + "eor w14, w14, w16, ror 8\n\t" + "bfi x12, x14, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x11, x11, x7\n\t" + "eor x12, x12, x8\n\t" + "ubfx x7, x12, #32, #8\n\t" + "ubfx x10, x12, #8, #8\n\t" + "ubfx x15, x11, #48, #8\n\t" + "ubfx x16, x11, #24, #8\n\t" + "lsl w7, w7, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w15, w15, #2\n\t" + "lsl w16, w16, #2\n\t" + "ldr x14, [%[te]]\n\t" + "ldr x14, [%[te], #64]\n\t" + "ldr x14, [%[te], #128]\n\t" + "ldr x14, [%[te], #192]\n\t" + "ldr x14, [%[te], #256]\n\t" + "ldr x14, [%[te], #320]\n\t" + "ldr x14, [%[te], #384]\n\t" + "ldr x14, [%[te], #448]\n\t" + "ldr x14, [%[te], #512]\n\t" + "ldr x14, [%[te], #576]\n\t" + "ldr x14, [%[te], #640]\n\t" + "ldr x14, [%[te], #704]\n\t" + "ldr x14, [%[te], #768]\n\t" + "ldr x14, [%[te], #832]\n\t" + "ldr x14, [%[te], #896]\n\t" + "ldr x14, [%[te], #960]\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "ldrb w16, [%[te], x16, LSL 0]\n\t" + "ubfx x8, x11, #0, #8\n\t" + "eor w7, w7, w10, lsl 8\n\t" + "ubfx x10, x12, #40, 
#8\n\t" + "eor w7, w7, w15, lsl 16\n\t" + "ubfx x15, x12, #16, #8\n\t" + "eor w7, w7, w16, lsl 24\n\t" + "ubfx x16, x11, #56, #8\n\t" + "lsl w8, w8, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w15, w15, #2\n\t" + "lsl w16, w16, #2\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "ldrb w16, [%[te], x16, LSL 0]\n\t" + "ubfx x9, x11, #32, #8\n\t" + "eor w8, w8, w10, lsl 8\n\t" + "ubfx x10, x11, #8, #8\n\t" + "eor w8, w8, w15, lsl 16\n\t" + "ubfx x15, x12, #48, #8\n\t" + "eor w8, w8, w16, lsl 24\n\t" + "ubfx x16, x12, #24, #8\n\t" + "bfi x7, x8, #32, #32\n\t" + "lsl w9, w9, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w15, w15, #2\n\t" + "lsl w16, w16, #2\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "ldrb w16, [%[te], x16, LSL 0]\n\t" + "ubfx x14, x12, #56, #8\n\t" + "eor w9, w9, w10, lsl 8\n\t" + "ubfx x10, x12, #0, #8\n\t" + "eor w9, w9, w15, lsl 16\n\t" + "ubfx x15, x11, #40, #8\n\t" + "eor w8, w9, w16, lsl 24\n\t" + "ubfx x16, x11, #16, #8\n\t" + "lsl w14, w14, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w15, w15, #2\n\t" + "lsl w16, w16, #2\n\t" + "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "ldrb w16, [%[te], x16, LSL 0]\n\t" + "eor w15, w15, w14, lsl 16\n\t" + "ldp x11, x12, [x19]\n\t" + "eor w10, w10, w15, lsl 8\n\t" + "eor w10, w10, w16, lsl 16\n\t" + "bfi x8, x10, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "rev32 x7, x7\n\t" + "rev32 x8, x8\n\t" + "str x7, [%x[out]]\n\t" + "str x8, [%x[out], #8]\n\t" + "subs %x[len], %x[len], #16\n\t" + "add %x[in], %x[in], #16\n\t" + "add %x[out], %x[out], #16\n\t" + "b.ne L_AES_CBC_encrypt_loop_block_%=\n\t" + "stp x7, x8, [%x[iv]]\n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [iv] "+r" (iv) + : [in] "r" (in), [ks] "r" (ks), [te] "r" (te) + : "memory", "cc", "x7", 
"x8", "x9", "x10", "x11", "x12", "x13", "x14", + "x15", "x16", "x17", "x19" + ); +} + +#endif /* HAVE_AES_CBC */ +#ifdef WOLFSSL_AES_COUNTER +void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr) +{ + const word32* te = L_AES_ARM64_te; + __asm__ __volatile__ ( + "ldp x15, x16, [%x[ctr]]\n\t" + "rev32 x15, x15\n\t" + "rev32 x16, x16\n\t" + "\n" + "L_AES_CTR_encrypt_loop_block_128_%=: \n\t" + "mov x21, %x[ks]\n\t" + "ldp x11, x12, [x21], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x7, x15, x11\n\t" + "eor x8, x16, x12\n\t" + "sub w20, %w[nr], #2\n\t" + "\n" + "L_AES_CTR_encrypt_loop_nr_%=: \n\t" + "ubfx x11, x7, #48, #8\n\t" + "ubfx x14, x7, #24, #8\n\t" + "ubfx x17, x8, #8, #8\n\t" + "ubfx x19, x8, #32, #8\n\t" + "ldr x9, [%[te]]\n\t" + "ldr x9, [%[te], #64]\n\t" + "ldr x9, [%[te], #128]\n\t" + "ldr x9, [%[te], #192]\n\t" + "ldr x9, [%[te], #256]\n\t" + "ldr x9, [%[te], #320]\n\t" + "ldr x9, [%[te], #384]\n\t" + "ldr x9, [%[te], #448]\n\t" + "ldr x9, [%[te], #512]\n\t" + "ldr x9, [%[te], #576]\n\t" + "ldr x9, [%[te], #640]\n\t" + "ldr x9, [%[te], #704]\n\t" + "ldr x9, [%[te], #768]\n\t" + "ldr x9, [%[te], #832]\n\t" + "ldr x9, [%[te], #896]\n\t" + "ldr x9, [%[te], #960]\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ubfx x12, x8, #16, #8\n\t" + "eor w11, w11, w14, ror 24\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w11, w17, ror 8\n\t" + "ubfx x17, x8, #40, #8\n\t" + "eor w11, w11, w19, ror 16\n\t" + "ubfx x19, x7, #0, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ubfx x13, x8, #48, #8\n\t" + "eor w12, w12, w14, 
ror 24\n\t" + "ubfx x14, x8, #24, #8\n\t" + "eor w12, w12, w17, ror 8\n\t" + "ubfx x17, x7, #8, #8\n\t" + "eor w12, w12, w19, ror 16\n\t" + "ubfx x19, x7, #32, #8\n\t" + "bfi x11, x12, #32, #32\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ubfx x9, x8, #0, #8\n\t" + "eor w13, w13, w14, ror 24\n\t" + "ubfx x14, x7, #16, #8\n\t" + "eor w13, w13, w17, ror 8\n\t" + "ubfx x17, x8, #56, #8\n\t" + "eor w12, w13, w19, ror 16\n\t" + "ubfx x19, x7, #40, #8\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "eor w17, w17, w9, ror 24\n\t" + "ldp x7, x8, [x21], #16\n\t" + "eor w14, w14, w17, ror 24\n\t" + "eor w14, w14, w19, ror 8\n\t" + "bfi x12, x14, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x11, x11, x7\n\t" + "eor x12, x12, x8\n\t" + "ubfx x7, x11, #48, #8\n\t" + "ubfx x10, x11, #24, #8\n\t" + "ubfx x17, x12, #8, #8\n\t" + "ubfx x19, x12, #32, #8\n\t" + "ldr x13, [%[te]]\n\t" + "ldr x13, [%[te], #64]\n\t" + "ldr x13, [%[te], #128]\n\t" + "ldr x13, [%[te], #192]\n\t" + "ldr x13, [%[te], #256]\n\t" + "ldr x13, [%[te], #320]\n\t" + "ldr x13, [%[te], #384]\n\t" + "ldr x13, [%[te], #448]\n\t" + "ldr x13, [%[te], #512]\n\t" + "ldr x13, [%[te], #576]\n\t" + "ldr x13, [%[te], #640]\n\t" + "ldr x13, [%[te], #704]\n\t" + "ldr x13, [%[te], #768]\n\t" + "ldr x13, [%[te], #832]\n\t" + "ldr x13, [%[te], #896]\n\t" + "ldr x13, [%[te], #960]\n\t" + "ldr w7, [%[te], x7, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ubfx x8, x12, #16, #8\n\t" + "eor w7, w7, w10, ror 24\n\t" + "ubfx x10, x11, #56, #8\n\t" + "eor w7, w7, w17, ror 8\n\t" + "ubfx x17, x12, #40, #8\n\t" + "eor w7, w7, w19, ror 16\n\t" + "ubfx x19, x11, #0, #8\n\t" + "ldr w8, [%[te], x8, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + 
"ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ubfx x9, x12, #48, #8\n\t" + "eor w8, w8, w10, ror 24\n\t" + "ubfx x10, x12, #24, #8\n\t" + "eor w8, w8, w17, ror 8\n\t" + "ubfx x17, x11, #8, #8\n\t" + "eor w8, w8, w19, ror 16\n\t" + "ubfx x19, x11, #32, #8\n\t" + "bfi x7, x8, #32, #32\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ubfx x13, x12, #0, #8\n\t" + "eor w9, w9, w10, ror 24\n\t" + "ubfx x10, x11, #16, #8\n\t" + "eor w9, w9, w17, ror 8\n\t" + "ubfx x17, x12, #56, #8\n\t" + "eor w8, w9, w19, ror 16\n\t" + "ubfx x19, x11, #40, #8\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "eor w17, w17, w13, ror 24\n\t" + "ldp x11, x12, [x21], #16\n\t" + "eor w10, w10, w17, ror 24\n\t" + "eor w10, w10, w19, ror 8\n\t" + "bfi x8, x10, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "subs w20, w20, #2\n\t" + "b.ne L_AES_CTR_encrypt_loop_nr_%=\n\t" + "ubfx x11, x7, #48, #8\n\t" + "ubfx x14, x7, #24, #8\n\t" + "ubfx x17, x8, #8, #8\n\t" + "ubfx x19, x8, #32, #8\n\t" + "ldr x9, [%[te]]\n\t" + "ldr x9, [%[te], #64]\n\t" + "ldr x9, [%[te], #128]\n\t" + "ldr x9, [%[te], #192]\n\t" + "ldr x9, [%[te], #256]\n\t" + "ldr x9, [%[te], #320]\n\t" + "ldr x9, [%[te], #384]\n\t" + "ldr x9, [%[te], #448]\n\t" + "ldr x9, [%[te], #512]\n\t" + "ldr x9, [%[te], #576]\n\t" + "ldr x9, [%[te], #640]\n\t" + "ldr x9, [%[te], #704]\n\t" + "ldr x9, [%[te], #768]\n\t" + "ldr x9, [%[te], #832]\n\t" + "ldr x9, [%[te], #896]\n\t" + "ldr x9, [%[te], #960]\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ubfx x12, x8, #16, #8\n\t" + "eor w11, w11, w14, ror 24\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w11, w17, ror 
8\n\t" + "ubfx x17, x8, #40, #8\n\t" + "eor w11, w11, w19, ror 16\n\t" + "ubfx x19, x7, #0, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ubfx x13, x8, #48, #8\n\t" + "eor w12, w12, w14, ror 24\n\t" + "ubfx x14, x8, #24, #8\n\t" + "eor w12, w12, w17, ror 8\n\t" + "ubfx x17, x7, #8, #8\n\t" + "eor w12, w12, w19, ror 16\n\t" + "ubfx x19, x7, #32, #8\n\t" + "bfi x11, x12, #32, #32\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ubfx x9, x8, #0, #8\n\t" + "eor w13, w13, w14, ror 24\n\t" + "ubfx x14, x7, #16, #8\n\t" + "eor w13, w13, w17, ror 8\n\t" + "ubfx x17, x8, #56, #8\n\t" + "eor w12, w13, w19, ror 16\n\t" + "ubfx x19, x7, #40, #8\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "eor w17, w17, w9, ror 24\n\t" + "ldp x7, x8, [x21], #16\n\t" + "eor w14, w14, w17, ror 24\n\t" + "eor w14, w14, w19, ror 8\n\t" + "bfi x12, x14, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x11, x11, x7\n\t" + "eor x12, x12, x8\n\t" + "ubfx x7, x12, #32, #8\n\t" + "ubfx x10, x12, #8, #8\n\t" + "ubfx x17, x11, #48, #8\n\t" + "ubfx x19, x11, #24, #8\n\t" + "lsl w7, w7, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w17, w17, #2\n\t" + "lsl w19, w19, #2\n\t" + "ldr x14, [%[te]]\n\t" + "ldr x14, [%[te], #64]\n\t" + "ldr x14, [%[te], #128]\n\t" + "ldr x14, [%[te], #192]\n\t" + "ldr x14, [%[te], #256]\n\t" + "ldr x14, [%[te], #320]\n\t" + "ldr x14, [%[te], #384]\n\t" + "ldr x14, [%[te], #448]\n\t" + "ldr x14, [%[te], #512]\n\t" + "ldr x14, [%[te], #576]\n\t" + "ldr x14, [%[te], #640]\n\t" + "ldr x14, [%[te], #704]\n\t" + "ldr x14, [%[te], #768]\n\t" + "ldr x14, [%[te], #832]\n\t" + "ldr x14, [%[te], #896]\n\t" + "ldr x14, [%[te], #960]\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + 
"ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w17, [%[te], x17, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ubfx x8, x11, #0, #8\n\t" + "eor w7, w7, w10, lsl 8\n\t" + "ubfx x10, x12, #40, #8\n\t" + "eor w7, w7, w17, lsl 16\n\t" + "ubfx x17, x12, #16, #8\n\t" + "eor w7, w7, w19, lsl 24\n\t" + "ubfx x19, x11, #56, #8\n\t" + "lsl w8, w8, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w17, w17, #2\n\t" + "lsl w19, w19, #2\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w17, [%[te], x17, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ubfx x9, x11, #32, #8\n\t" + "eor w8, w8, w10, lsl 8\n\t" + "ubfx x10, x11, #8, #8\n\t" + "eor w8, w8, w17, lsl 16\n\t" + "ubfx x17, x12, #48, #8\n\t" + "eor w8, w8, w19, lsl 24\n\t" + "ubfx x19, x12, #24, #8\n\t" + "bfi x7, x8, #32, #32\n\t" + "lsl w9, w9, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w17, w17, #2\n\t" + "lsl w19, w19, #2\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w17, [%[te], x17, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ubfx x14, x12, #56, #8\n\t" + "eor w9, w9, w10, lsl 8\n\t" + "ubfx x10, x12, #0, #8\n\t" + "eor w9, w9, w17, lsl 16\n\t" + "ubfx x17, x11, #40, #8\n\t" + "eor w8, w9, w19, lsl 24\n\t" + "ubfx x19, x11, #16, #8\n\t" + "lsl w14, w14, #2\n\t" + "lsl w10, w10, #2\n\t" + "lsl w17, w17, #2\n\t" + "lsl w19, w19, #2\n\t" + "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w17, [%[te], x17, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "eor w17, w17, w14, lsl 16\n\t" + "ldp x11, x12, [x21]\n\t" + "eor w10, w10, w17, lsl 8\n\t" + "eor w10, w10, w19, lsl 16\n\t" + "bfi x8, x10, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "rev32 x7, x7\n\t" + "rev32 x8, x8\n\t" + "ldr x11, [%x[in]]\n\t" + "ldr x12, [%x[in], #8]\n\t" + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "str x7, [%x[out]]\n\t" + "str x8, [%x[out], #8]\n\t" + "ror x16, x16, 
#32\n\t" + "ror x15, x15, #32\n\t" + "adds x16, x16, #1\n\t" + "adc x15, x15, xzr\n\t" + "ror x16, x16, #32\n\t" + "ror x15, x15, #32\n\t" + "subs %x[len], %x[len], #16\n\t" + "add %x[in], %x[in], #16\n\t" + "add %x[out], %x[out], #16\n\t" + "b.ne L_AES_CTR_encrypt_loop_block_128_%=\n\t" + "rev32 x15, x15\n\t" + "rev32 x16, x16\n\t" + "stp x15, x16, [%x[ctr]]\n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [in] "r" (in), [ks] "r" (ks), [te] "r" (te) + : "memory", "cc", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", + "x15", "x16", "x17", "x19", "x20", "x21" + ); +} + +#endif /* WOLFSSL_AES_COUNTER */ +#ifdef HAVE_AES_DECRYPT +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_CBC) || defined(HAVE_AES_ECB) +static const word8 L_AES_ARM64_td4[] = { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, + 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, + 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, + 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, + 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, + 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, + 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, + 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, + 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, + 0x9a, 0xdb, 
0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, + 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, + 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, + 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d, +}; + +#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || defined(HAVE_AES_ECB) +void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr); +void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr) +{ + const word32* td = L_AES_ARM64_td; + const word8* td4 = L_AES_ARM64_td4; + __asm__ __volatile__ ( + "\n" + "L_AES_ECB_decrypt_loop_block_%=: \n\t" + "mov x19, %x[ks]\n\t" + "ldr x7, [%x[in]]\n\t" + "ldr x8, [%x[in], #8]\n\t" + "rev32 x7, x7\n\t" + "rev32 x8, x8\n\t" + "ldp x11, x12, [x19], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "sub w17, %w[nr], #2\n\t" + "\n" + "L_AES_ECB_decrypt_loop_nr_%=: \n\t" + "ubfx x11, x8, #48, #8\n\t" + "ubfx x14, x7, #24, #8\n\t" + "ubfx x15, x8, #8, #8\n\t" + "ubfx x16, x7, #32, #8\n\t" + "ldr x9, [%[td]]\n\t" + "ldr x9, [%[td], #64]\n\t" + "ldr x9, [%[td], #128]\n\t" + "ldr x9, [%[td], #192]\n\t" + "ldr x9, [%[td], #256]\n\t" + "ldr x9, [%[td], #320]\n\t" + "ldr x9, [%[td], #384]\n\t" + "ldr x9, [%[td], #448]\n\t" + "ldr x9, [%[td], #512]\n\t" + "ldr x9, [%[td], #576]\n\t" + "ldr x9, [%[td], #640]\n\t" + "ldr x9, [%[td], #704]\n\t" + "ldr x9, [%[td], #768]\n\t" + "ldr x9, [%[td], #832]\n\t" + "ldr x9, [%[td], #896]\n\t" + "ldr x9, [%[td], #960]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ubfx x12, x7, 
#16, #8\n\t" + "eor w11, w11, w14, ror 24\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w11, w15, ror 8\n\t" + "ubfx x15, x8, #40, #8\n\t" + "eor w11, w11, w16, ror 16\n\t" + "ubfx x16, x8, #0, #8\n\t" + "ldr w12, [%[td], x12, LSL 2]\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ubfx x13, x7, #48, #8\n\t" + "eor w12, w12, w14, ror 24\n\t" + "ubfx x14, x8, #24, #8\n\t" + "eor w12, w12, w15, ror 8\n\t" + "ubfx x15, x7, #8, #8\n\t" + "eor w12, w12, w16, ror 16\n\t" + "ubfx x16, x8, #32, #8\n\t" + "bfi x11, x12, #32, #32\n\t" + "ldr w13, [%[td], x13, LSL 2]\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ubfx x9, x7, #0, #8\n\t" + "eor w13, w13, w14, ror 24\n\t" + "ubfx x14, x8, #16, #8\n\t" + "eor w13, w13, w15, ror 8\n\t" + "ubfx x15, x8, #56, #8\n\t" + "eor w12, w13, w16, ror 16\n\t" + "ubfx x16, x7, #40, #8\n\t" + "ldr w9, [%[td], x9, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "eor w15, w15, w9, ror 24\n\t" + "ldp x7, x8, [x19], #16\n\t" + "eor w14, w14, w16, ror 8\n\t" + "eor w14, w14, w15, ror 24\n\t" + "bfi x12, x14, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x11, x11, x7\n\t" + "eor x12, x12, x8\n\t" + "ubfx x7, x12, #48, #8\n\t" + "ubfx x10, x11, #24, #8\n\t" + "ubfx x15, x12, #8, #8\n\t" + "ubfx x16, x11, #32, #8\n\t" + "ldr x13, [%[td]]\n\t" + "ldr x13, [%[td], #64]\n\t" + "ldr x13, [%[td], #128]\n\t" + "ldr x13, [%[td], #192]\n\t" + "ldr x13, [%[td], #256]\n\t" + "ldr x13, [%[td], #320]\n\t" + "ldr x13, [%[td], #384]\n\t" + "ldr x13, [%[td], #448]\n\t" + "ldr x13, [%[td], #512]\n\t" + "ldr x13, [%[td], #576]\n\t" + "ldr x13, [%[td], #640]\n\t" + "ldr x13, [%[td], #704]\n\t" + "ldr x13, [%[td], #768]\n\t" + "ldr x13, [%[td], #832]\n\t" + "ldr x13, [%[td], #896]\n\t" + "ldr x13, [%[td], #960]\n\t" + "ldr w7, [%[td], x7, LSL 
2]\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ubfx x8, x11, #16, #8\n\t" + "eor w7, w7, w10, ror 24\n\t" + "ubfx x10, x11, #56, #8\n\t" + "eor w7, w7, w15, ror 8\n\t" + "ubfx x15, x12, #40, #8\n\t" + "eor w7, w7, w16, ror 16\n\t" + "ubfx x16, x12, #0, #8\n\t" + "ldr w8, [%[td], x8, LSL 2]\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ubfx x9, x11, #48, #8\n\t" + "eor w8, w8, w10, ror 24\n\t" + "ubfx x10, x12, #24, #8\n\t" + "eor w8, w8, w15, ror 8\n\t" + "ubfx x15, x11, #8, #8\n\t" + "eor w8, w8, w16, ror 16\n\t" + "ubfx x16, x12, #32, #8\n\t" + "bfi x7, x8, #32, #32\n\t" + "ldr w9, [%[td], x9, LSL 2]\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ubfx x13, x11, #0, #8\n\t" + "eor w9, w9, w10, ror 24\n\t" + "ubfx x10, x12, #16, #8\n\t" + "eor w9, w9, w15, ror 8\n\t" + "ubfx x15, x12, #56, #8\n\t" + "eor w8, w9, w16, ror 16\n\t" + "ubfx x16, x11, #40, #8\n\t" + "ldr w13, [%[td], x13, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "eor w15, w15, w13, ror 24\n\t" + "ldp x11, x12, [x19], #16\n\t" + "eor w10, w10, w16, ror 8\n\t" + "eor w10, w10, w15, ror 24\n\t" + "bfi x8, x10, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "subs w17, w17, #2\n\t" + "b.ne L_AES_ECB_decrypt_loop_nr_%=\n\t" + "ubfx x11, x8, #48, #8\n\t" + "ubfx x14, x7, #24, #8\n\t" + "ubfx x15, x8, #8, #8\n\t" + "ubfx x16, x7, #32, #8\n\t" + "ldr x9, [%[td]]\n\t" + "ldr x9, [%[td], #64]\n\t" + "ldr x9, [%[td], #128]\n\t" + "ldr x9, [%[td], #192]\n\t" + "ldr x9, [%[td], #256]\n\t" + "ldr x9, [%[td], #320]\n\t" + "ldr x9, [%[td], #384]\n\t" + "ldr x9, [%[td], #448]\n\t" + "ldr x9, [%[td], #512]\n\t" + "ldr x9, [%[td], #576]\n\t" + "ldr x9, [%[td], #640]\n\t" + "ldr 
x9, [%[td], #704]\n\t" + "ldr x9, [%[td], #768]\n\t" + "ldr x9, [%[td], #832]\n\t" + "ldr x9, [%[td], #896]\n\t" + "ldr x9, [%[td], #960]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ubfx x12, x7, #16, #8\n\t" + "eor w11, w11, w14, ror 24\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w11, w15, ror 8\n\t" + "ubfx x15, x8, #40, #8\n\t" + "eor w11, w11, w16, ror 16\n\t" + "ubfx x16, x8, #0, #8\n\t" + "ldr w12, [%[td], x12, LSL 2]\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ubfx x13, x7, #48, #8\n\t" + "eor w12, w12, w14, ror 24\n\t" + "ubfx x14, x8, #24, #8\n\t" + "eor w12, w12, w15, ror 8\n\t" + "ubfx x15, x7, #8, #8\n\t" + "eor w12, w12, w16, ror 16\n\t" + "ubfx x16, x8, #32, #8\n\t" + "bfi x11, x12, #32, #32\n\t" + "ldr w13, [%[td], x13, LSL 2]\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ubfx x9, x7, #0, #8\n\t" + "eor w13, w13, w14, ror 24\n\t" + "ubfx x14, x8, #16, #8\n\t" + "eor w13, w13, w15, ror 8\n\t" + "ubfx x15, x8, #56, #8\n\t" + "eor w12, w13, w16, ror 16\n\t" + "ubfx x16, x7, #40, #8\n\t" + "ldr w9, [%[td], x9, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "eor w15, w15, w9, ror 24\n\t" + "ldp x7, x8, [x19], #16\n\t" + "eor w14, w14, w16, ror 8\n\t" + "eor w14, w14, w15, ror 24\n\t" + "bfi x12, x14, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x11, x11, x7\n\t" + "eor x12, x12, x8\n\t" + "ubfx x7, x11, #32, #8\n\t" + "ubfx x10, x12, #8, #8\n\t" + "ubfx x15, x12, #48, #8\n\t" + "ubfx x16, x11, #24, #8\n\t" + "ldr x14, [%[td4]]\n\t" + "ldr x14, [%[td4], #64]\n\t" + "ldr x14, [%[td4], #128]\n\t" + "ldr x14, [%[td4], #192]\n\t" + "ldr x14, [%[td4], #256]\n\t" + "ldr x14, [%[td4], #320]\n\t" + "ldr x14, [%[td4], 
#384]\n\t" + "ldr x14, [%[td4], #448]\n\t" + "ldr x14, [%[td4], #512]\n\t" + "ldr x14, [%[td4], #576]\n\t" + "ldr x14, [%[td4], #640]\n\t" + "ldr x14, [%[td4], #704]\n\t" + "ldr x14, [%[td4], #768]\n\t" + "ldr x14, [%[td4], #832]\n\t" + "ldr x14, [%[td4], #896]\n\t" + "ldr x14, [%[td4], #960]\n\t" + "ldrb w7, [%[td4], x7, LSL 0]\n\t" + "ldrb w10, [%[td4], x10, LSL 0]\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ubfx x8, x12, #0, #8\n\t" + "eor w7, w7, w10, lsl 8\n\t" + "ubfx x10, x12, #40, #8\n\t" + "eor w7, w7, w15, lsl 16\n\t" + "ubfx x15, x11, #16, #8\n\t" + "eor w7, w7, w16, lsl 24\n\t" + "ubfx x16, x11, #56, #8\n\t" + "ldrb w10, [%[td4], x10, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ldrb w8, [%[td4], x8, LSL 0]\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ubfx x9, x12, #32, #8\n\t" + "eor w8, w8, w10, lsl 8\n\t" + "ubfx x10, x11, #8, #8\n\t" + "eor w8, w8, w15, lsl 16\n\t" + "ubfx x15, x11, #48, #8\n\t" + "eor w8, w8, w16, lsl 24\n\t" + "ubfx x16, x12, #24, #8\n\t" + "bfi x7, x8, #32, #32\n\t" + "ldrb w10, [%[td4], x10, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ldrb w9, [%[td4], x9, LSL 0]\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ubfx x14, x12, #56, #8\n\t" + "eor w9, w9, w10, lsl 8\n\t" + "ubfx x10, x11, #0, #8\n\t" + "eor w9, w9, w15, lsl 16\n\t" + "ubfx x15, x11, #40, #8\n\t" + "eor w8, w9, w16, lsl 24\n\t" + "ubfx x16, x12, #16, #8\n\t" + "ldrb w14, [%[td4], x14, LSL 0]\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w10, [%[td4], x10, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "eor w15, w15, w14, lsl 16\n\t" + "ldp x11, x12, [x19]\n\t" + "eor w10, w10, w15, lsl 8\n\t" + "eor w10, w10, w16, lsl 16\n\t" + "bfi x8, x10, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x7, x7, x11\n\t" + "eor x8, x8, x12\n\t" + "rev32 x7, x7\n\t" + "rev32 x8, x8\n\t" + "str x7, [%x[out]]\n\t" + "str x8, [%x[out], #8]\n\t" + "subs %x[len], %x[len], #16\n\t" + "add %x[in], %x[in], #16\n\t" + 
"add %x[out], %x[out], #16\n\t" + "b.ne L_AES_ECB_decrypt_loop_block_%=\n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr) + : [in] "r" (in), [ks] "r" (ks), [td] "r" (td), [td4] "r" (td4) + : "memory", "cc", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", + "x15", "x16", "x17", "x19" + ); +} + +#endif /* WOLFSSL_AES_DIRECT || WOLFSSL_AES_COUNTER || defined(HAVE_AES_ECB) */ +#ifdef HAVE_AES_CBC +void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); +void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv) +{ + const word8* td4 = L_AES_ARM64_td4; + const word32* td = L_AES_ARM64_td; + __asm__ __volatile__ ( + "\n" + "L_AES_CBC_decrypt_loop_block_%=: \n\t" + "mov x20, %x[ks]\n\t" + "ldr x8, [%x[in]]\n\t" + "ldr x9, [%x[in], #8]\n\t" + "stnp x8, x9, [%x[iv], #16]\n\t" + "rev32 x8, x8\n\t" + "rev32 x9, x9\n\t" + "ldp x12, x13, [x20], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x8, x8, x12\n\t" + "eor x9, x9, x13\n\t" + "sub w19, %w[nr], #2\n\t" + "\n" + "L_AES_CBC_decrypt_loop_nr_even_%=: \n\t" + "ubfx x12, x9, #48, #8\n\t" + "ubfx x15, x8, #24, #8\n\t" + "ubfx x16, x9, #8, #8\n\t" + "ubfx x17, x8, #32, #8\n\t" + "ldr x10, [%[td]]\n\t" + "ldr x10, [%[td], #64]\n\t" + "ldr x10, [%[td], #128]\n\t" + "ldr x10, [%[td], #192]\n\t" + "ldr x10, [%[td], #256]\n\t" + "ldr x10, [%[td], #320]\n\t" + "ldr x10, [%[td], #384]\n\t" + "ldr x10, [%[td], #448]\n\t" + "ldr x10, [%[td], #512]\n\t" + "ldr x10, [%[td], #576]\n\t" + "ldr x10, [%[td], #640]\n\t" + "ldr x10, [%[td], #704]\n\t" + "ldr x10, [%[td], #768]\n\t" + "ldr x10, [%[td], #832]\n\t" + "ldr x10, [%[td], #896]\n\t" + "ldr x10, [%[td], #960]\n\t" + "ldr w12, [%[td], x12, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x13, x8, #16, #8\n\t" + "eor w12, 
w12, w15, ror 24\n\t" + "ubfx x15, x8, #56, #8\n\t" + "eor w12, w12, w16, ror 8\n\t" + "ubfx x16, x9, #40, #8\n\t" + "eor w12, w12, w17, ror 16\n\t" + "ubfx x17, x9, #0, #8\n\t" + "ldr w13, [%[td], x13, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x14, x8, #48, #8\n\t" + "eor w13, w13, w15, ror 24\n\t" + "ubfx x15, x9, #24, #8\n\t" + "eor w13, w13, w16, ror 8\n\t" + "ubfx x16, x8, #8, #8\n\t" + "eor w13, w13, w17, ror 16\n\t" + "ubfx x17, x9, #32, #8\n\t" + "bfi x12, x13, #32, #32\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x10, x8, #0, #8\n\t" + "eor w14, w14, w15, ror 24\n\t" + "ubfx x15, x9, #16, #8\n\t" + "eor w14, w14, w16, ror 8\n\t" + "ubfx x16, x9, #56, #8\n\t" + "eor w13, w14, w17, ror 16\n\t" + "ubfx x17, x8, #40, #8\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "eor w16, w16, w10, ror 24\n\t" + "ldp x8, x9, [x20], #16\n\t" + "eor w15, w15, w17, ror 8\n\t" + "eor w15, w15, w16, ror 24\n\t" + "bfi x13, x15, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x12, x12, x8\n\t" + "eor x13, x13, x9\n\t" + "ubfx x8, x13, #48, #8\n\t" + "ubfx x11, x12, #24, #8\n\t" + "ubfx x16, x13, #8, #8\n\t" + "ubfx x17, x12, #32, #8\n\t" + "ldr x14, [%[td]]\n\t" + "ldr x14, [%[td], #64]\n\t" + "ldr x14, [%[td], #128]\n\t" + "ldr x14, [%[td], #192]\n\t" + "ldr x14, [%[td], #256]\n\t" + "ldr x14, [%[td], #320]\n\t" + "ldr x14, [%[td], #384]\n\t" + "ldr x14, [%[td], #448]\n\t" + "ldr x14, [%[td], #512]\n\t" + "ldr x14, [%[td], #576]\n\t" + "ldr x14, [%[td], #640]\n\t" + "ldr x14, [%[td], #704]\n\t" + "ldr x14, [%[td], #768]\n\t" + "ldr x14, [%[td], #832]\n\t" + "ldr x14, [%[td], #896]\n\t" + "ldr x14, [%[td], #960]\n\t" + "ldr w8, [%[td], x8, LSL 2]\n\t" + "ldr w11, [%[td], 
x11, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x9, x12, #16, #8\n\t" + "eor w8, w8, w11, ror 24\n\t" + "ubfx x11, x12, #56, #8\n\t" + "eor w8, w8, w16, ror 8\n\t" + "ubfx x16, x13, #40, #8\n\t" + "eor w8, w8, w17, ror 16\n\t" + "ubfx x17, x13, #0, #8\n\t" + "ldr w9, [%[td], x9, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x10, x12, #48, #8\n\t" + "eor w9, w9, w11, ror 24\n\t" + "ubfx x11, x13, #24, #8\n\t" + "eor w9, w9, w16, ror 8\n\t" + "ubfx x16, x12, #8, #8\n\t" + "eor w9, w9, w17, ror 16\n\t" + "ubfx x17, x13, #32, #8\n\t" + "bfi x8, x9, #32, #32\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x14, x12, #0, #8\n\t" + "eor w10, w10, w11, ror 24\n\t" + "ubfx x11, x13, #16, #8\n\t" + "eor w10, w10, w16, ror 8\n\t" + "ubfx x16, x13, #56, #8\n\t" + "eor w9, w10, w17, ror 16\n\t" + "ubfx x17, x12, #40, #8\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "eor w16, w16, w14, ror 24\n\t" + "ldp x12, x13, [x20], #16\n\t" + "eor w11, w11, w17, ror 8\n\t" + "eor w11, w11, w16, ror 24\n\t" + "bfi x9, x11, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x8, x8, x12\n\t" + "eor x9, x9, x13\n\t" + "subs w19, w19, #2\n\t" + "b.ne L_AES_CBC_decrypt_loop_nr_even_%=\n\t" + "ubfx x12, x9, #48, #8\n\t" + "ubfx x15, x8, #24, #8\n\t" + "ubfx x16, x9, #8, #8\n\t" + "ubfx x17, x8, #32, #8\n\t" + "ldr x10, [%[td]]\n\t" + "ldr x10, [%[td], #64]\n\t" + "ldr x10, [%[td], #128]\n\t" + "ldr x10, [%[td], #192]\n\t" + "ldr x10, [%[td], #256]\n\t" + "ldr x10, [%[td], #320]\n\t" + "ldr x10, [%[td], #384]\n\t" + "ldr x10, [%[td], #448]\n\t" + "ldr x10, [%[td], #512]\n\t" + "ldr x10, [%[td], #576]\n\t" + "ldr x10, [%[td], #640]\n\t" + "ldr x10, 
[%[td], #704]\n\t" + "ldr x10, [%[td], #768]\n\t" + "ldr x10, [%[td], #832]\n\t" + "ldr x10, [%[td], #896]\n\t" + "ldr x10, [%[td], #960]\n\t" + "ldr w12, [%[td], x12, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x13, x8, #16, #8\n\t" + "eor w12, w12, w15, ror 24\n\t" + "ubfx x15, x8, #56, #8\n\t" + "eor w12, w12, w16, ror 8\n\t" + "ubfx x16, x9, #40, #8\n\t" + "eor w12, w12, w17, ror 16\n\t" + "ubfx x17, x9, #0, #8\n\t" + "ldr w13, [%[td], x13, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x14, x8, #48, #8\n\t" + "eor w13, w13, w15, ror 24\n\t" + "ubfx x15, x9, #24, #8\n\t" + "eor w13, w13, w16, ror 8\n\t" + "ubfx x16, x8, #8, #8\n\t" + "eor w13, w13, w17, ror 16\n\t" + "ubfx x17, x9, #32, #8\n\t" + "bfi x12, x13, #32, #32\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x10, x8, #0, #8\n\t" + "eor w14, w14, w15, ror 24\n\t" + "ubfx x15, x9, #16, #8\n\t" + "eor w14, w14, w16, ror 8\n\t" + "ubfx x16, x9, #56, #8\n\t" + "eor w13, w14, w17, ror 16\n\t" + "ubfx x17, x8, #40, #8\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "eor w16, w16, w10, ror 24\n\t" + "ldp x8, x9, [x20], #16\n\t" + "eor w15, w15, w17, ror 8\n\t" + "eor w15, w15, w16, ror 24\n\t" + "bfi x13, x15, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x12, x12, x8\n\t" + "eor x13, x13, x9\n\t" + "ubfx x8, x12, #32, #8\n\t" + "ubfx x11, x13, #8, #8\n\t" + "ubfx x16, x13, #48, #8\n\t" + "ubfx x17, x12, #24, #8\n\t" + "ldr x15, [%[td4]]\n\t" + "ldr x15, [%[td4], #64]\n\t" + "ldr x15, [%[td4], #128]\n\t" + "ldr x15, [%[td4], #192]\n\t" + "ldr x15, [%[td4], #256]\n\t" + "ldr x15, [%[td4], #320]\n\t" + "ldr x15, [%[td4], 
#384]\n\t" + "ldr x15, [%[td4], #448]\n\t" + "ldr x15, [%[td4], #512]\n\t" + "ldr x15, [%[td4], #576]\n\t" + "ldr x15, [%[td4], #640]\n\t" + "ldr x15, [%[td4], #704]\n\t" + "ldr x15, [%[td4], #768]\n\t" + "ldr x15, [%[td4], #832]\n\t" + "ldr x15, [%[td4], #896]\n\t" + "ldr x15, [%[td4], #960]\n\t" + "ldrb w8, [%[td4], x8, LSL 0]\n\t" + "ldrb w11, [%[td4], x11, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ldrb w17, [%[td4], x17, LSL 0]\n\t" + "ubfx x9, x13, #0, #8\n\t" + "eor w8, w8, w11, lsl 8\n\t" + "ubfx x11, x13, #40, #8\n\t" + "eor w8, w8, w16, lsl 16\n\t" + "ubfx x16, x12, #16, #8\n\t" + "eor w8, w8, w17, lsl 24\n\t" + "ubfx x17, x12, #56, #8\n\t" + "ldrb w11, [%[td4], x11, LSL 0]\n\t" + "ldrb w17, [%[td4], x17, LSL 0]\n\t" + "ldrb w9, [%[td4], x9, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ubfx x10, x13, #32, #8\n\t" + "eor w9, w9, w11, lsl 8\n\t" + "ubfx x11, x12, #8, #8\n\t" + "eor w9, w9, w16, lsl 16\n\t" + "ubfx x16, x12, #48, #8\n\t" + "eor w9, w9, w17, lsl 24\n\t" + "ubfx x17, x13, #24, #8\n\t" + "bfi x8, x9, #32, #32\n\t" + "ldrb w11, [%[td4], x11, LSL 0]\n\t" + "ldrb w17, [%[td4], x17, LSL 0]\n\t" + "ldrb w10, [%[td4], x10, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ubfx x15, x13, #56, #8\n\t" + "eor w10, w10, w11, lsl 8\n\t" + "ubfx x11, x12, #0, #8\n\t" + "eor w10, w10, w16, lsl 16\n\t" + "ubfx x16, x12, #40, #8\n\t" + "eor w9, w10, w17, lsl 24\n\t" + "ubfx x17, x13, #16, #8\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ldrb w11, [%[td4], x11, LSL 0]\n\t" + "ldrb w17, [%[td4], x17, LSL 0]\n\t" + "eor w16, w16, w15, lsl 16\n\t" + "ldp x12, x13, [x20]\n\t" + "eor w11, w11, w16, lsl 8\n\t" + "eor w11, w11, w17, lsl 16\n\t" + "bfi x9, x11, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x8, x8, x12\n\t" + "eor x9, x9, x13\n\t" + "rev32 x8, x8\n\t" + "rev32 x9, x9\n\t" + "ldp x12, x13, [%x[iv]]\n\t" + "eor x8, x8, x12\n\t" + "eor x9, x9, x13\n\t" + "str x8, [%x[out]]\n\t" + "str x9, 
[%x[out], #8]\n\t" + "subs %x[len], %x[len], #16\n\t" + "add %x[in], %x[in], #16\n\t" + "add %x[out], %x[out], #16\n\t" + "b.eq L_AES_CBC_decrypt_end_dec_odd_%=\n\t" + "mov x20, %x[ks]\n\t" + "ldr x8, [%x[in]]\n\t" + "ldr x9, [%x[in], #8]\n\t" + "stp x8, x9, [%x[iv]]\n\t" + "rev32 x8, x8\n\t" + "rev32 x9, x9\n\t" + "ldp x12, x13, [x20], #16\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x8, x8, x12\n\t" + "eor x9, x9, x13\n\t" + "sub w19, %w[nr], #2\n\t" + "\n" + "L_AES_CBC_decrypt_loop_nr_odd_%=: \n\t" + "ubfx x12, x9, #48, #8\n\t" + "ubfx x15, x8, #24, #8\n\t" + "ubfx x16, x9, #8, #8\n\t" + "ubfx x17, x8, #32, #8\n\t" + "ldr x10, [%[td]]\n\t" + "ldr x10, [%[td], #64]\n\t" + "ldr x10, [%[td], #128]\n\t" + "ldr x10, [%[td], #192]\n\t" + "ldr x10, [%[td], #256]\n\t" + "ldr x10, [%[td], #320]\n\t" + "ldr x10, [%[td], #384]\n\t" + "ldr x10, [%[td], #448]\n\t" + "ldr x10, [%[td], #512]\n\t" + "ldr x10, [%[td], #576]\n\t" + "ldr x10, [%[td], #640]\n\t" + "ldr x10, [%[td], #704]\n\t" + "ldr x10, [%[td], #768]\n\t" + "ldr x10, [%[td], #832]\n\t" + "ldr x10, [%[td], #896]\n\t" + "ldr x10, [%[td], #960]\n\t" + "ldr w12, [%[td], x12, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x13, x8, #16, #8\n\t" + "eor w12, w12, w15, ror 24\n\t" + "ubfx x15, x8, #56, #8\n\t" + "eor w12, w12, w16, ror 8\n\t" + "ubfx x16, x9, #40, #8\n\t" + "eor w12, w12, w17, ror 16\n\t" + "ubfx x17, x9, #0, #8\n\t" + "ldr w13, [%[td], x13, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x14, x8, #48, #8\n\t" + "eor w13, w13, w15, ror 24\n\t" + "ubfx x15, x9, #24, #8\n\t" + "eor w13, w13, w16, ror 8\n\t" + "ubfx x16, x8, #8, #8\n\t" + "eor w13, w13, w17, ror 16\n\t" + "ubfx x17, x9, #32, #8\n\t" + "bfi x12, x13, #32, #32\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" 
+ "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x10, x8, #0, #8\n\t" + "eor w14, w14, w15, ror 24\n\t" + "ubfx x15, x9, #16, #8\n\t" + "eor w14, w14, w16, ror 8\n\t" + "ubfx x16, x9, #56, #8\n\t" + "eor w13, w14, w17, ror 16\n\t" + "ubfx x17, x8, #40, #8\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "eor w16, w16, w10, ror 24\n\t" + "ldp x8, x9, [x20], #16\n\t" + "eor w15, w15, w17, ror 8\n\t" + "eor w15, w15, w16, ror 24\n\t" + "bfi x13, x15, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x12, x12, x8\n\t" + "eor x13, x13, x9\n\t" + "ubfx x8, x13, #48, #8\n\t" + "ubfx x11, x12, #24, #8\n\t" + "ubfx x16, x13, #8, #8\n\t" + "ubfx x17, x12, #32, #8\n\t" + "ldr x14, [%[td]]\n\t" + "ldr x14, [%[td], #64]\n\t" + "ldr x14, [%[td], #128]\n\t" + "ldr x14, [%[td], #192]\n\t" + "ldr x14, [%[td], #256]\n\t" + "ldr x14, [%[td], #320]\n\t" + "ldr x14, [%[td], #384]\n\t" + "ldr x14, [%[td], #448]\n\t" + "ldr x14, [%[td], #512]\n\t" + "ldr x14, [%[td], #576]\n\t" + "ldr x14, [%[td], #640]\n\t" + "ldr x14, [%[td], #704]\n\t" + "ldr x14, [%[td], #768]\n\t" + "ldr x14, [%[td], #832]\n\t" + "ldr x14, [%[td], #896]\n\t" + "ldr x14, [%[td], #960]\n\t" + "ldr w8, [%[td], x8, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x9, x12, #16, #8\n\t" + "eor w8, w8, w11, ror 24\n\t" + "ubfx x11, x12, #56, #8\n\t" + "eor w8, w8, w16, ror 8\n\t" + "ubfx x16, x13, #40, #8\n\t" + "eor w8, w8, w17, ror 16\n\t" + "ubfx x17, x13, #0, #8\n\t" + "ldr w9, [%[td], x9, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x10, x12, #48, #8\n\t" + "eor w9, w9, w11, ror 24\n\t" + "ubfx x11, x13, #24, #8\n\t" + "eor w9, w9, w16, ror 8\n\t" + "ubfx x16, x12, #8, #8\n\t" + "eor w9, w9, w17, ror 16\n\t" + "ubfx x17, x13, #32, #8\n\t" + "bfi x8, x9, 
#32, #32\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x14, x12, #0, #8\n\t" + "eor w10, w10, w11, ror 24\n\t" + "ubfx x11, x13, #16, #8\n\t" + "eor w10, w10, w16, ror 8\n\t" + "ubfx x16, x13, #56, #8\n\t" + "eor w9, w10, w17, ror 16\n\t" + "ubfx x17, x12, #40, #8\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w11, [%[td], x11, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "eor w16, w16, w14, ror 24\n\t" + "ldp x12, x13, [x20], #16\n\t" + "eor w11, w11, w17, ror 8\n\t" + "eor w11, w11, w16, ror 24\n\t" + "bfi x9, x11, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x8, x8, x12\n\t" + "eor x9, x9, x13\n\t" + "subs w19, w19, #2\n\t" + "b.ne L_AES_CBC_decrypt_loop_nr_odd_%=\n\t" + "ubfx x12, x9, #48, #8\n\t" + "ubfx x15, x8, #24, #8\n\t" + "ubfx x16, x9, #8, #8\n\t" + "ubfx x17, x8, #32, #8\n\t" + "ldr x10, [%[td]]\n\t" + "ldr x10, [%[td], #64]\n\t" + "ldr x10, [%[td], #128]\n\t" + "ldr x10, [%[td], #192]\n\t" + "ldr x10, [%[td], #256]\n\t" + "ldr x10, [%[td], #320]\n\t" + "ldr x10, [%[td], #384]\n\t" + "ldr x10, [%[td], #448]\n\t" + "ldr x10, [%[td], #512]\n\t" + "ldr x10, [%[td], #576]\n\t" + "ldr x10, [%[td], #640]\n\t" + "ldr x10, [%[td], #704]\n\t" + "ldr x10, [%[td], #768]\n\t" + "ldr x10, [%[td], #832]\n\t" + "ldr x10, [%[td], #896]\n\t" + "ldr x10, [%[td], #960]\n\t" + "ldr w12, [%[td], x12, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x13, x8, #16, #8\n\t" + "eor w12, w12, w15, ror 24\n\t" + "ubfx x15, x8, #56, #8\n\t" + "eor w12, w12, w16, ror 8\n\t" + "ubfx x16, x9, #40, #8\n\t" + "eor w12, w12, w17, ror 16\n\t" + "ubfx x17, x9, #0, #8\n\t" + "ldr w13, [%[td], x13, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x14, x8, #48, #8\n\t" 
+ "eor w13, w13, w15, ror 24\n\t" + "ubfx x15, x9, #24, #8\n\t" + "eor w13, w13, w16, ror 8\n\t" + "ubfx x16, x8, #8, #8\n\t" + "eor w13, w13, w17, ror 16\n\t" + "ubfx x17, x9, #32, #8\n\t" + "bfi x12, x13, #32, #32\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ubfx x10, x8, #0, #8\n\t" + "eor w14, w14, w15, ror 24\n\t" + "ubfx x15, x9, #16, #8\n\t" + "eor w14, w14, w16, ror 8\n\t" + "ubfx x16, x9, #56, #8\n\t" + "eor w13, w14, w17, ror 16\n\t" + "ubfx x17, x8, #40, #8\n\t" + "ldr w10, [%[td], x10, LSL 2]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "eor w16, w16, w10, ror 24\n\t" + "ldp x8, x9, [x20], #16\n\t" + "eor w15, w15, w17, ror 8\n\t" + "eor w15, w15, w16, ror 24\n\t" + "bfi x13, x15, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x12, x12, x8\n\t" + "eor x13, x13, x9\n\t" + "ubfx x8, x12, #32, #8\n\t" + "ubfx x11, x13, #8, #8\n\t" + "ubfx x16, x13, #48, #8\n\t" + "ubfx x17, x12, #24, #8\n\t" + "ldr x15, [%[td4]]\n\t" + "ldr x15, [%[td4], #64]\n\t" + "ldr x15, [%[td4], #128]\n\t" + "ldr x15, [%[td4], #192]\n\t" + "ldr x15, [%[td4], #256]\n\t" + "ldr x15, [%[td4], #320]\n\t" + "ldr x15, [%[td4], #384]\n\t" + "ldr x15, [%[td4], #448]\n\t" + "ldr x15, [%[td4], #512]\n\t" + "ldr x15, [%[td4], #576]\n\t" + "ldr x15, [%[td4], #640]\n\t" + "ldr x15, [%[td4], #704]\n\t" + "ldr x15, [%[td4], #768]\n\t" + "ldr x15, [%[td4], #832]\n\t" + "ldr x15, [%[td4], #896]\n\t" + "ldr x15, [%[td4], #960]\n\t" + "ldrb w8, [%[td4], x8, LSL 0]\n\t" + "ldrb w11, [%[td4], x11, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ldrb w17, [%[td4], x17, LSL 0]\n\t" + "ubfx x9, x13, #0, #8\n\t" + "eor w8, w8, w11, lsl 8\n\t" + "ubfx x11, x13, #40, #8\n\t" + "eor w8, w8, w16, lsl 16\n\t" + "ubfx x16, x12, #16, #8\n\t" + "eor w8, w8, w17, lsl 24\n\t" + "ubfx x17, x12, #56, #8\n\t" + "ldrb w11, [%[td4], 
x11, LSL 0]\n\t" + "ldrb w17, [%[td4], x17, LSL 0]\n\t" + "ldrb w9, [%[td4], x9, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ubfx x10, x13, #32, #8\n\t" + "eor w9, w9, w11, lsl 8\n\t" + "ubfx x11, x12, #8, #8\n\t" + "eor w9, w9, w16, lsl 16\n\t" + "ubfx x16, x12, #48, #8\n\t" + "eor w9, w9, w17, lsl 24\n\t" + "ubfx x17, x13, #24, #8\n\t" + "bfi x8, x9, #32, #32\n\t" + "ldrb w11, [%[td4], x11, LSL 0]\n\t" + "ldrb w17, [%[td4], x17, LSL 0]\n\t" + "ldrb w10, [%[td4], x10, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ubfx x15, x13, #56, #8\n\t" + "eor w10, w10, w11, lsl 8\n\t" + "ubfx x11, x12, #0, #8\n\t" + "eor w10, w10, w16, lsl 16\n\t" + "ubfx x16, x12, #40, #8\n\t" + "eor w9, w10, w17, lsl 24\n\t" + "ubfx x17, x13, #16, #8\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w16, [%[td4], x16, LSL 0]\n\t" + "ldrb w11, [%[td4], x11, LSL 0]\n\t" + "ldrb w17, [%[td4], x17, LSL 0]\n\t" + "eor w16, w16, w15, lsl 16\n\t" + "ldp x12, x13, [x20]\n\t" + "eor w11, w11, w16, lsl 8\n\t" + "eor w11, w11, w17, lsl 16\n\t" + "bfi x9, x11, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x8, x8, x12\n\t" + "eor x9, x9, x13\n\t" + "rev32 x8, x8\n\t" + "rev32 x9, x9\n\t" + "ldnp x12, x13, [%x[iv], #16]\n\t" + "eor x8, x8, x12\n\t" + "eor x9, x9, x13\n\t" + "str x8, [%x[out]]\n\t" + "str x9, [%x[out], #8]\n\t" + "subs %x[len], %x[len], #16\n\t" + "add %x[in], %x[in], #16\n\t" + "add %x[out], %x[out], #16\n\t" + "b.ne L_AES_CBC_decrypt_loop_block_%=\n\t" + "b L_AES_CBC_decrypt_end_dec_%=\n\t" + "\n" + "L_AES_CBC_decrypt_end_dec_odd_%=: \n\t" + "ldnp x12, x13, [%x[iv], #16]\n\t" + "stp x12, x13, [%x[iv]]\n\t" + "\n" + "L_AES_CBC_decrypt_end_dec_%=: \n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [iv] "+r" (iv) + : [in] "r" (in), [ks] "r" (ks), [td4] "r" (td4), [td] "r" (td) + : "memory", "cc", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x19", "x20" + ); +} + +#endif /* HAVE_AES_CBC */ +#endif /* WOLFSSL_AES_DIRECT || 
WOLFSSL_AES_COUNTER || HAVE_AES_CBC + * HAVE_AES_ECB */ +#endif /* HAVE_AES_DECRYPT */ +#ifdef HAVE_AESGCM +static const word32 L_GCM_gmult_len_r[] = { + 0x00000000, 0x1c200000, 0x38400000, 0x24600000, + 0x70800000, 0x6ca00000, 0x48c00000, 0x54e00000, + 0xe1000000, 0xfd200000, 0xd9400000, 0xc5600000, + 0x91800000, 0x8da00000, 0xa9c00000, 0xb5e00000, + 0x00000000, 0x01c20000, 0x03840000, 0x02460000, + 0x07080000, 0x06ca0000, 0x048c0000, 0x054e0000, + 0x0e100000, 0x0fd20000, 0x0d940000, 0x0c560000, + 0x09180000, 0x08da0000, 0x0a9c0000, 0x0b5e0000, +}; + +void GCM_gmult_len(unsigned char* x, const unsigned char** m, + const unsigned char* data, unsigned long len); +void GCM_gmult_len(unsigned char* x, const unsigned char** m, + const unsigned char* data, unsigned long len) +{ + const word32* r = L_GCM_gmult_len_r; + __asm__ __volatile__ ( + "\n" + "L_GCM_gmult_len_start_block_%=: \n\t" + "ldp x4, x5, [%x[x]]\n\t" + "ldp x6, x7, [%x[data]]\n\t" + "eor x4, x4, x6\n\t" + "eor x5, x5, x7\n\t" + "ubfx x12, x5, #56, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x8, x9, [x12]\n\t" + "ubfx x12, x5, #60, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x5, #48, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x5, #52, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, 
#8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x5, #40, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x5, #44, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x5, #32, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x5, #36, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x5, #24, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x5, #28, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, 
%x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x5, #16, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x5, #20, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x5, #8, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x5, #12, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x5, #0, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x5, 
#4, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x4, #56, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x4, #60, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x4, #48, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x4, #52, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x4, #40, #4\n\t" + "add x12, %x[m], x12, lsl 
4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x4, #44, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x4, #32, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x4, #36, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x4, #24, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x4, #28, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 
32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x4, #16, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x4, #20, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfx x12, x4, #8, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x12, x4, #12, #4\n\t" + "mov x11, x9\n\t" + "add x12, x12, #16\n\t" + "lsr x9, x9, #8\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 56\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #8\n\t" + "eor x8, x8, x6\n\t" + "sub x12, x12, #0x100\n\t" + "eor x9, x9, x7\n\t" + "ldr x7, [x12, #8]\n\t" + "ubfx w6, w11, #0, #4\n\t" + "eor x11, x11, x7, lsl 4\n\t" + "add w6, w6, #16\n\t" + "ubfx w11, w11, #4, #4\n\t" + "ldr w6, [%[r], x6, LSL 2]\n\t" + "ldr w7, [%[r], x11, LSL 2]\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "eor x8, x8, x7, lsl 32\n\t" + "ubfiz x12, x4, #4, #4\n\t" + "add x12, x12, %x[m]\n\t" + "ldp x6, x7, [x12]\n\t" + "eor x8, x8, x6\n\t" + "eor x9, x9, x7\n\t" + "ubfx x11, x9, #0, #4\n\t" + "ubfx x12, x4, #4, #4\n\t" + "lsr x9, x9, #4\n\t" + "add x12, %x[m], x12, lsl 4\n\t" + "orr x9, x9, x8, lsl 60\n\t" + "ldp x6, x7, [x12]\n\t" + "lsr x8, x8, #4\n\t" + "eor x8, x8, x6\n\t" + "ldr w6, [%[r], x11, LSL 2]\n\t" + "eor x9, x9, x7\n\t" + "eor x8, x8, x6, lsl 32\n\t" + "rev x8, x8\n\t" + "rev x9, x9\n\t" + "stp x8, x9, [%x[x]]\n\t" + "subs %x[len], %x[len], #16\n\t" + "add %x[data], %x[data], 
#16\n\t" + "b.ne L_GCM_gmult_len_start_block_%=\n\t" + : [x] "+r" (x), [len] "+r" (len) + : [m] "r" (m), [data] "r" (data), [r] "r" (r) + : "memory", "cc", "x4", "x5", "x6", "x7", "x8", "x9", "x11", "x12" + ); +} + +void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr) +{ + const word32* te = L_AES_ARM64_te; + __asm__ __volatile__ ( + "ldp x16, x17, [%x[ctr]]\n\t" + "rev32 x16, x16\n\t" + "rev32 x17, x17\n\t" + "\n" + "L_AES_GCM_encrypt_loop_block_%=: \n\t" + "mov x21, %x[ks]\n\t" + "lsr x9, x17, #32\n\t" + "ldp x10, x11, [x21], #16\n\t" + "add w9, w9, #1\n\t" + "bfi x17, x9, #32, #32\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x6, x16, x10\n\t" + "eor x7, x17, x11\n\t" + "sub w20, %w[nr], #2\n\t" + "\n" + "L_AES_GCM_encrypt_loop_nr_%=: \n\t" + "ubfx x10, x6, #48, #8\n\t" + "ubfx x13, x6, #24, #8\n\t" + "ubfx x14, x7, #8, #8\n\t" + "ubfx x15, x7, #32, #8\n\t" + "ldr x8, [%[te]]\n\t" + "ldr x8, [%[te], #64]\n\t" + "ldr x8, [%[te], #128]\n\t" + "ldr x8, [%[te], #192]\n\t" + "ldr x8, [%[te], #256]\n\t" + "ldr x8, [%[te], #320]\n\t" + "ldr x8, [%[te], #384]\n\t" + "ldr x8, [%[te], #448]\n\t" + "ldr x8, [%[te], #512]\n\t" + "ldr x8, [%[te], #576]\n\t" + "ldr x8, [%[te], #640]\n\t" + "ldr x8, [%[te], #704]\n\t" + "ldr x8, [%[te], #768]\n\t" + "ldr x8, [%[te], #832]\n\t" + "ldr x8, [%[te], #896]\n\t" + "ldr x8, [%[te], #960]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x11, x7, #16, #8\n\t" + "eor w10, w10, w13, ror 24\n\t" + "ubfx x13, x6, #56, #8\n\t" + "eor w10, w10, w14, ror 8\n\t" + "ubfx x14, x7, #40, #8\n\t" + "eor w10, w10, w15, ror 16\n\t" + "ubfx x15, x6, #0, #8\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w13, 
[%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x12, x7, #48, #8\n\t" + "eor w11, w11, w13, ror 24\n\t" + "ubfx x13, x7, #24, #8\n\t" + "eor w11, w11, w14, ror 8\n\t" + "ubfx x14, x6, #8, #8\n\t" + "eor w11, w11, w15, ror 16\n\t" + "ubfx x15, x6, #32, #8\n\t" + "bfi x10, x11, #32, #32\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x8, x7, #0, #8\n\t" + "eor w12, w12, w13, ror 24\n\t" + "ubfx x13, x6, #16, #8\n\t" + "eor w12, w12, w14, ror 8\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w12, w15, ror 16\n\t" + "ubfx x15, x6, #40, #8\n\t" + "ldr w8, [%[te], x8, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "eor w14, w14, w8, ror 24\n\t" + "ldp x6, x7, [x21], #16\n\t" + "eor w13, w13, w14, ror 24\n\t" + "eor w13, w13, w15, ror 8\n\t" + "bfi x11, x13, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x10, x10, x6\n\t" + "eor x11, x11, x7\n\t" + "ubfx x6, x10, #48, #8\n\t" + "ubfx x9, x10, #24, #8\n\t" + "ubfx x14, x11, #8, #8\n\t" + "ubfx x15, x11, #32, #8\n\t" + "ldr x12, [%[te]]\n\t" + "ldr x12, [%[te], #64]\n\t" + "ldr x12, [%[te], #128]\n\t" + "ldr x12, [%[te], #192]\n\t" + "ldr x12, [%[te], #256]\n\t" + "ldr x12, [%[te], #320]\n\t" + "ldr x12, [%[te], #384]\n\t" + "ldr x12, [%[te], #448]\n\t" + "ldr x12, [%[te], #512]\n\t" + "ldr x12, [%[te], #576]\n\t" + "ldr x12, [%[te], #640]\n\t" + "ldr x12, [%[te], #704]\n\t" + "ldr x12, [%[te], #768]\n\t" + "ldr x12, [%[te], #832]\n\t" + "ldr x12, [%[te], #896]\n\t" + "ldr x12, [%[te], #960]\n\t" + "ldr w6, [%[te], x6, LSL 2]\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x7, x11, #16, #8\n\t" + "eor w6, w6, w9, ror 24\n\t" + "ubfx x9, x10, #56, #8\n\t" + "eor w6, w6, w14, ror 8\n\t" + "ubfx x14, x11, 
#40, #8\n\t" + "eor w6, w6, w15, ror 16\n\t" + "ubfx x15, x10, #0, #8\n\t" + "ldr w7, [%[te], x7, LSL 2]\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x8, x11, #48, #8\n\t" + "eor w7, w7, w9, ror 24\n\t" + "ubfx x9, x11, #24, #8\n\t" + "eor w7, w7, w14, ror 8\n\t" + "ubfx x14, x10, #8, #8\n\t" + "eor w7, w7, w15, ror 16\n\t" + "ubfx x15, x10, #32, #8\n\t" + "bfi x6, x7, #32, #32\n\t" + "ldr w8, [%[te], x8, LSL 2]\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x12, x11, #0, #8\n\t" + "eor w8, w8, w9, ror 24\n\t" + "ubfx x9, x10, #16, #8\n\t" + "eor w8, w8, w14, ror 8\n\t" + "ubfx x14, x11, #56, #8\n\t" + "eor w7, w8, w15, ror 16\n\t" + "ubfx x15, x10, #40, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w9, [%[te], x9, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "eor w14, w14, w12, ror 24\n\t" + "ldp x10, x11, [x21], #16\n\t" + "eor w9, w9, w14, ror 24\n\t" + "eor w9, w9, w15, ror 8\n\t" + "bfi x7, x9, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x6, x6, x10\n\t" + "eor x7, x7, x11\n\t" + "subs w20, w20, #2\n\t" + "b.ne L_AES_GCM_encrypt_loop_nr_%=\n\t" + "ubfx x10, x6, #48, #8\n\t" + "ubfx x13, x6, #24, #8\n\t" + "ubfx x14, x7, #8, #8\n\t" + "ubfx x15, x7, #32, #8\n\t" + "ldr x8, [%[te]]\n\t" + "ldr x8, [%[te], #64]\n\t" + "ldr x8, [%[te], #128]\n\t" + "ldr x8, [%[te], #192]\n\t" + "ldr x8, [%[te], #256]\n\t" + "ldr x8, [%[te], #320]\n\t" + "ldr x8, [%[te], #384]\n\t" + "ldr x8, [%[te], #448]\n\t" + "ldr x8, [%[te], #512]\n\t" + "ldr x8, [%[te], #576]\n\t" + "ldr x8, [%[te], #640]\n\t" + "ldr x8, [%[te], #704]\n\t" + "ldr x8, [%[te], #768]\n\t" + "ldr x8, [%[te], #832]\n\t" + "ldr x8, [%[te], #896]\n\t" + "ldr x8, [%[te], #960]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, 
LSL 2]\n\t" + "ubfx x11, x7, #16, #8\n\t" + "eor w10, w10, w13, ror 24\n\t" + "ubfx x13, x6, #56, #8\n\t" + "eor w10, w10, w14, ror 8\n\t" + "ubfx x14, x7, #40, #8\n\t" + "eor w10, w10, w15, ror 16\n\t" + "ubfx x15, x6, #0, #8\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x12, x7, #48, #8\n\t" + "eor w11, w11, w13, ror 24\n\t" + "ubfx x13, x7, #24, #8\n\t" + "eor w11, w11, w14, ror 8\n\t" + "ubfx x14, x6, #8, #8\n\t" + "eor w11, w11, w15, ror 16\n\t" + "ubfx x15, x6, #32, #8\n\t" + "bfi x10, x11, #32, #32\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ubfx x8, x7, #0, #8\n\t" + "eor w12, w12, w13, ror 24\n\t" + "ubfx x13, x6, #16, #8\n\t" + "eor w12, w12, w14, ror 8\n\t" + "ubfx x14, x7, #56, #8\n\t" + "eor w11, w12, w15, ror 16\n\t" + "ubfx x15, x6, #40, #8\n\t" + "ldr w8, [%[te], x8, LSL 2]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "eor w14, w14, w8, ror 24\n\t" + "ldp x6, x7, [x21], #16\n\t" + "eor w13, w13, w14, ror 24\n\t" + "eor w13, w13, w15, ror 8\n\t" + "bfi x11, x13, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x10, x10, x6\n\t" + "eor x11, x11, x7\n\t" + "ubfx x6, x11, #32, #8\n\t" + "ubfx x9, x11, #8, #8\n\t" + "ubfx x14, x10, #48, #8\n\t" + "ubfx x15, x10, #24, #8\n\t" + "lsl w6, w6, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w14, w14, #2\n\t" + "lsl w15, w15, #2\n\t" + "ldr x13, [%[te]]\n\t" + "ldr x13, [%[te], #64]\n\t" + "ldr x13, [%[te], #128]\n\t" + "ldr x13, [%[te], #192]\n\t" + "ldr x13, [%[te], #256]\n\t" + "ldr x13, [%[te], #320]\n\t" + "ldr x13, [%[te], #384]\n\t" + "ldr x13, [%[te], #448]\n\t" + "ldr x13, [%[te], #512]\n\t" + "ldr x13, [%[te], #576]\n\t" + "ldr x13, [%[te], #640]\n\t" + "ldr x13, [%[te], #704]\n\t" + "ldr x13, [%[te], #768]\n\t" + 
"ldr x13, [%[te], #832]\n\t" + "ldr x13, [%[te], #896]\n\t" + "ldr x13, [%[te], #960]\n\t" + "ldrb w6, [%[te], x6, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "ubfx x7, x10, #0, #8\n\t" + "eor w6, w6, w9, lsl 8\n\t" + "ubfx x9, x11, #40, #8\n\t" + "eor w6, w6, w14, lsl 16\n\t" + "ubfx x14, x11, #16, #8\n\t" + "eor w6, w6, w15, lsl 24\n\t" + "ubfx x15, x10, #56, #8\n\t" + "lsl w7, w7, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w14, w14, #2\n\t" + "lsl w15, w15, #2\n\t" + "ldrb w7, [%[te], x7, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "ubfx x8, x10, #32, #8\n\t" + "eor w7, w7, w9, lsl 8\n\t" + "ubfx x9, x10, #8, #8\n\t" + "eor w7, w7, w14, lsl 16\n\t" + "ubfx x14, x11, #48, #8\n\t" + "eor w7, w7, w15, lsl 24\n\t" + "ubfx x15, x11, #24, #8\n\t" + "bfi x6, x7, #32, #32\n\t" + "lsl w8, w8, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w14, w14, #2\n\t" + "lsl w15, w15, #2\n\t" + "ldrb w8, [%[te], x8, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "ubfx x13, x11, #56, #8\n\t" + "eor w8, w8, w9, lsl 8\n\t" + "ubfx x9, x11, #0, #8\n\t" + "eor w8, w8, w14, lsl 16\n\t" + "ubfx x14, x10, #40, #8\n\t" + "eor w7, w8, w15, lsl 24\n\t" + "ubfx x15, x10, #16, #8\n\t" + "lsl w13, w13, #2\n\t" + "lsl w9, w9, #2\n\t" + "lsl w14, w14, #2\n\t" + "lsl w15, w15, #2\n\t" + "ldrb w13, [%[te], x13, LSL 0]\n\t" + "ldrb w9, [%[te], x9, LSL 0]\n\t" + "ldrb w14, [%[te], x14, LSL 0]\n\t" + "ldrb w15, [%[te], x15, LSL 0]\n\t" + "eor w14, w14, w13, lsl 16\n\t" + "ldp x10, x11, [x21]\n\t" + "eor w9, w9, w14, lsl 8\n\t" + "eor w9, w9, w15, lsl 16\n\t" + "bfi x7, x9, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x6, x6, x10\n\t" + "eor x7, x7, x11\n\t" + "rev32 x6, x6\n\t" + "rev32 x7, x7\n\t" + "ldr x10, [%x[in]]\n\t" + "ldr x11, [%x[in], #8]\n\t" + "eor x6, x6, 
x10\n\t" + "eor x7, x7, x11\n\t" + "str x6, [%x[out]]\n\t" + "str x7, [%x[out], #8]\n\t" + "subs %x[len], %x[len], #16\n\t" + "add %x[in], %x[in], #16\n\t" + "add %x[out], %x[out], #16\n\t" + "b.ne L_AES_GCM_encrypt_loop_block_%=\n\t" + "rev32 x16, x16\n\t" + "rev32 x17, x17\n\t" + "stp x16, x17, [%x[ctr]]\n\t" + : [out] "+r" (out), [len] "+r" (len), [nr] "+r" (nr), [ctr] "+r" (ctr) + : [in] "r" (in), [ks] "r" (ks), [te] "r" (te) + : "memory", "cc", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", + "x14", "x15", "x16", "x17", "x20", "x21" + ); +} + +#endif /* HAVE_AESGCM */ +#ifdef WOLFSSL_AES_XTS +void AES_XTS_encrypt(const byte* in, byte* out, word32 sz, const byte* i, + byte* key, byte* key2, byte* tmp, int nr) +{ + const word32* te = L_AES_ARM64_te; + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" + "mov x9, #0x87\n\t" + "mov x26, %x[key2]\n\t" + "ldp x21, x22, [%x[i]]\n\t" + "ldp x14, x15, [x26], #16\n\t" + "rev32 x21, x21\n\t" + "rev32 x22, x22\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x21, x21, x14\n\t" + "eor x22, x22, x15\n\t" + "sub w25, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_encrypt_loop_nr_tweak_%=: \n\t" + "ubfx x14, x21, #48, #8\n\t" + "ubfx x17, x21, #24, #8\n\t" + "ubfx x19, x22, #8, #8\n\t" + "ubfx x20, x22, #32, #8\n\t" + "ldr x23, [%[te]]\n\t" + "ldr x23, [%[te], #64]\n\t" + "ldr x23, [%[te], #128]\n\t" + "ldr x23, [%[te], #192]\n\t" + "ldr x23, [%[te], #256]\n\t" + "ldr x23, [%[te], #320]\n\t" + "ldr x23, [%[te], #384]\n\t" + "ldr x23, [%[te], #448]\n\t" + "ldr x23, [%[te], #512]\n\t" + "ldr x23, [%[te], #576]\n\t" + "ldr x23, [%[te], #640]\n\t" + "ldr x23, [%[te], #704]\n\t" + "ldr x23, [%[te], #768]\n\t" + "ldr x23, [%[te], #832]\n\t" + "ldr x23, [%[te], #896]\n\t" + "ldr x23, [%[te], #960]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x15, x22, #16, #8\n\t" + "eor w14, w14, w17, ror 
24\n\t" + "ubfx x17, x21, #56, #8\n\t" + "eor w14, w14, w19, ror 8\n\t" + "ubfx x19, x22, #40, #8\n\t" + "eor w14, w14, w20, ror 16\n\t" + "ubfx x20, x21, #0, #8\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x16, x22, #48, #8\n\t" + "eor w15, w15, w17, ror 24\n\t" + "ubfx x17, x22, #24, #8\n\t" + "eor w15, w15, w19, ror 8\n\t" + "ubfx x19, x21, #8, #8\n\t" + "eor w15, w15, w20, ror 16\n\t" + "ubfx x20, x21, #32, #8\n\t" + "bfi x14, x15, #32, #32\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x23, x22, #0, #8\n\t" + "eor w16, w16, w17, ror 24\n\t" + "ubfx x17, x21, #16, #8\n\t" + "eor w16, w16, w19, ror 8\n\t" + "ubfx x19, x22, #56, #8\n\t" + "eor w15, w16, w20, ror 16\n\t" + "ubfx x20, x21, #40, #8\n\t" + "ldr w23, [%[te], x23, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w23, ror 24\n\t" + "ldp x21, x22, [x26], #16\n\t" + "eor w17, w17, w19, ror 24\n\t" + "eor w17, w17, w20, ror 8\n\t" + "bfi x15, x17, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x14, x14, x21\n\t" + "eor x15, x15, x22\n\t" + "ubfx x21, x14, #48, #8\n\t" + "ubfx x24, x14, #24, #8\n\t" + "ubfx x19, x15, #8, #8\n\t" + "ubfx x20, x15, #32, #8\n\t" + "ldr x16, [%[te]]\n\t" + "ldr x16, [%[te], #64]\n\t" + "ldr x16, [%[te], #128]\n\t" + "ldr x16, [%[te], #192]\n\t" + "ldr x16, [%[te], #256]\n\t" + "ldr x16, [%[te], #320]\n\t" + "ldr x16, [%[te], #384]\n\t" + "ldr x16, [%[te], #448]\n\t" + "ldr x16, [%[te], #512]\n\t" + "ldr x16, [%[te], #576]\n\t" + "ldr x16, [%[te], #640]\n\t" + "ldr x16, [%[te], #704]\n\t" + "ldr x16, [%[te], #768]\n\t" + "ldr x16, [%[te], #832]\n\t" + "ldr x16, [%[te], #896]\n\t" + "ldr x16, [%[te], #960]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w24, 
[%[te], x24, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x22, x15, #16, #8\n\t" + "eor w21, w21, w24, ror 24\n\t" + "ubfx x24, x14, #56, #8\n\t" + "eor w21, w21, w19, ror 8\n\t" + "ubfx x19, x15, #40, #8\n\t" + "eor w21, w21, w20, ror 16\n\t" + "ubfx x20, x14, #0, #8\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "ldr w24, [%[te], x24, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x23, x15, #48, #8\n\t" + "eor w22, w22, w24, ror 24\n\t" + "ubfx x24, x15, #24, #8\n\t" + "eor w22, w22, w19, ror 8\n\t" + "ubfx x19, x14, #8, #8\n\t" + "eor w22, w22, w20, ror 16\n\t" + "ubfx x20, x14, #32, #8\n\t" + "bfi x21, x22, #32, #32\n\t" + "ldr w23, [%[te], x23, LSL 2]\n\t" + "ldr w24, [%[te], x24, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x16, x15, #0, #8\n\t" + "eor w23, w23, w24, ror 24\n\t" + "ubfx x24, x14, #16, #8\n\t" + "eor w23, w23, w19, ror 8\n\t" + "ubfx x19, x15, #56, #8\n\t" + "eor w22, w23, w20, ror 16\n\t" + "ubfx x20, x14, #40, #8\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w24, [%[te], x24, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w16, ror 24\n\t" + "ldp x14, x15, [x26], #16\n\t" + "eor w24, w24, w19, ror 24\n\t" + "eor w24, w24, w20, ror 8\n\t" + "bfi x22, x24, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x21, x21, x14\n\t" + "eor x22, x22, x15\n\t" + "subs w25, w25, #2\n\t" + "b.ne L_AES_XTS_encrypt_loop_nr_tweak_%=\n\t" + "ubfx x14, x21, #48, #8\n\t" + "ubfx x17, x21, #24, #8\n\t" + "ubfx x19, x22, #8, #8\n\t" + "ubfx x20, x22, #32, #8\n\t" + "ldr x23, [%[te]]\n\t" + "ldr x23, [%[te], #64]\n\t" + "ldr x23, [%[te], #128]\n\t" + "ldr x23, [%[te], #192]\n\t" + "ldr x23, [%[te], #256]\n\t" + "ldr x23, [%[te], #320]\n\t" + "ldr x23, [%[te], #384]\n\t" + "ldr x23, [%[te], #448]\n\t" + "ldr x23, [%[te], #512]\n\t" + "ldr x23, [%[te], #576]\n\t" + "ldr 
x23, [%[te], #640]\n\t" + "ldr x23, [%[te], #704]\n\t" + "ldr x23, [%[te], #768]\n\t" + "ldr x23, [%[te], #832]\n\t" + "ldr x23, [%[te], #896]\n\t" + "ldr x23, [%[te], #960]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x15, x22, #16, #8\n\t" + "eor w14, w14, w17, ror 24\n\t" + "ubfx x17, x21, #56, #8\n\t" + "eor w14, w14, w19, ror 8\n\t" + "ubfx x19, x22, #40, #8\n\t" + "eor w14, w14, w20, ror 16\n\t" + "ubfx x20, x21, #0, #8\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x16, x22, #48, #8\n\t" + "eor w15, w15, w17, ror 24\n\t" + "ubfx x17, x22, #24, #8\n\t" + "eor w15, w15, w19, ror 8\n\t" + "ubfx x19, x21, #8, #8\n\t" + "eor w15, w15, w20, ror 16\n\t" + "ubfx x20, x21, #32, #8\n\t" + "bfi x14, x15, #32, #32\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x23, x22, #0, #8\n\t" + "eor w16, w16, w17, ror 24\n\t" + "ubfx x17, x21, #16, #8\n\t" + "eor w16, w16, w19, ror 8\n\t" + "ubfx x19, x22, #56, #8\n\t" + "eor w15, w16, w20, ror 16\n\t" + "ubfx x20, x21, #40, #8\n\t" + "ldr w23, [%[te], x23, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w23, ror 24\n\t" + "ldp x21, x22, [x26], #16\n\t" + "eor w17, w17, w19, ror 24\n\t" + "eor w17, w17, w20, ror 8\n\t" + "bfi x15, x17, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x14, x14, x21\n\t" + "eor x15, x15, x22\n\t" + "ubfx x21, x15, #32, #8\n\t" + "ubfx x24, x15, #8, #8\n\t" + "ubfx x19, x14, #48, #8\n\t" + "ubfx x20, x14, #24, #8\n\t" + "lsl w21, w21, #2\n\t" + "lsl w24, w24, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldr x17, [%[te]]\n\t" + "ldr x17, [%[te], #64]\n\t" + 
"ldr x17, [%[te], #128]\n\t" + "ldr x17, [%[te], #192]\n\t" + "ldr x17, [%[te], #256]\n\t" + "ldr x17, [%[te], #320]\n\t" + "ldr x17, [%[te], #384]\n\t" + "ldr x17, [%[te], #448]\n\t" + "ldr x17, [%[te], #512]\n\t" + "ldr x17, [%[te], #576]\n\t" + "ldr x17, [%[te], #640]\n\t" + "ldr x17, [%[te], #704]\n\t" + "ldr x17, [%[te], #768]\n\t" + "ldr x17, [%[te], #832]\n\t" + "ldr x17, [%[te], #896]\n\t" + "ldr x17, [%[te], #960]\n\t" + "ldrb w21, [%[te], x21, LSL 0]\n\t" + "ldrb w24, [%[te], x24, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x22, x14, #0, #8\n\t" + "eor w21, w21, w24, lsl 8\n\t" + "ubfx x24, x15, #40, #8\n\t" + "eor w21, w21, w19, lsl 16\n\t" + "ubfx x19, x15, #16, #8\n\t" + "eor w21, w21, w20, lsl 24\n\t" + "ubfx x20, x14, #56, #8\n\t" + "lsl w22, w22, #2\n\t" + "lsl w24, w24, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w22, [%[te], x22, LSL 0]\n\t" + "ldrb w24, [%[te], x24, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x23, x14, #32, #8\n\t" + "eor w22, w22, w24, lsl 8\n\t" + "ubfx x24, x14, #8, #8\n\t" + "eor w22, w22, w19, lsl 16\n\t" + "ubfx x19, x15, #48, #8\n\t" + "eor w22, w22, w20, lsl 24\n\t" + "ubfx x20, x15, #24, #8\n\t" + "bfi x21, x22, #32, #32\n\t" + "lsl w23, w23, #2\n\t" + "lsl w24, w24, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w23, [%[te], x23, LSL 0]\n\t" + "ldrb w24, [%[te], x24, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x17, x15, #56, #8\n\t" + "eor w23, w23, w24, lsl 8\n\t" + "ubfx x24, x15, #0, #8\n\t" + "eor w23, w23, w19, lsl 16\n\t" + "ubfx x19, x14, #40, #8\n\t" + "eor w22, w23, w20, lsl 24\n\t" + "ubfx x20, x14, #16, #8\n\t" + "lsl w17, w17, #2\n\t" + "lsl w24, w24, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w17, [%[te], x17, LSL 0]\n\t" + "ldrb w24, [%[te], x24, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 
0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "eor w19, w19, w17, lsl 16\n\t" + "ldp x14, x15, [x26]\n\t" + "eor w24, w24, w19, lsl 8\n\t" + "eor w24, w24, w20, lsl 16\n\t" + "bfi x22, x24, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x21, x21, x14\n\t" + "eor x22, x22, x15\n\t" + "rev32 x21, x21\n\t" + "rev32 x22, x22\n\t" + "\n" + "L_AES_XTS_encrypt_loop_block_%=: \n\t" + "mov x26, %x[key]\n\t" + "ldp x10, x11, [%x[in]]\n\t" + "ldp x14, x15, [x26], #16\n\t" + "eor x10, x10, x21\n\t" + "eor x11, x11, x22\n\t" + "rev32 x10, x10\n\t" + "rev32 x11, x11\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x10, x10, x14\n\t" + "eor x11, x11, x15\n\t" + "sub w25, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_encrypt_loop_nr_%=: \n\t" + "ubfx x14, x10, #48, #8\n\t" + "ubfx x17, x10, #24, #8\n\t" + "ubfx x19, x11, #8, #8\n\t" + "ubfx x20, x11, #32, #8\n\t" + "ldr x12, [%[te]]\n\t" + "ldr x12, [%[te], #64]\n\t" + "ldr x12, [%[te], #128]\n\t" + "ldr x12, [%[te], #192]\n\t" + "ldr x12, [%[te], #256]\n\t" + "ldr x12, [%[te], #320]\n\t" + "ldr x12, [%[te], #384]\n\t" + "ldr x12, [%[te], #448]\n\t" + "ldr x12, [%[te], #512]\n\t" + "ldr x12, [%[te], #576]\n\t" + "ldr x12, [%[te], #640]\n\t" + "ldr x12, [%[te], #704]\n\t" + "ldr x12, [%[te], #768]\n\t" + "ldr x12, [%[te], #832]\n\t" + "ldr x12, [%[te], #896]\n\t" + "ldr x12, [%[te], #960]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x15, x11, #16, #8\n\t" + "eor w14, w14, w17, ror 24\n\t" + "ubfx x17, x10, #56, #8\n\t" + "eor w14, w14, w19, ror 8\n\t" + "ubfx x19, x11, #40, #8\n\t" + "eor w14, w14, w20, ror 16\n\t" + "ubfx x20, x10, #0, #8\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x16, x11, #48, #8\n\t" + "eor w15, w15, w17, ror 24\n\t" + "ubfx x17, x11, #24, #8\n\t" + "eor w15, w15, w19, ror 8\n\t" + 
"ubfx x19, x10, #8, #8\n\t" + "eor w15, w15, w20, ror 16\n\t" + "ubfx x20, x10, #32, #8\n\t" + "bfi x14, x15, #32, #32\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x12, x11, #0, #8\n\t" + "eor w16, w16, w17, ror 24\n\t" + "ubfx x17, x10, #16, #8\n\t" + "eor w16, w16, w19, ror 8\n\t" + "ubfx x19, x11, #56, #8\n\t" + "eor w15, w16, w20, ror 16\n\t" + "ubfx x20, x10, #40, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w12, ror 24\n\t" + "ldp x10, x11, [x26], #16\n\t" + "eor w17, w17, w19, ror 24\n\t" + "eor w17, w17, w20, ror 8\n\t" + "bfi x15, x17, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x14, x14, x10\n\t" + "eor x15, x15, x11\n\t" + "ubfx x10, x14, #48, #8\n\t" + "ubfx x13, x14, #24, #8\n\t" + "ubfx x19, x15, #8, #8\n\t" + "ubfx x20, x15, #32, #8\n\t" + "ldr x16, [%[te]]\n\t" + "ldr x16, [%[te], #64]\n\t" + "ldr x16, [%[te], #128]\n\t" + "ldr x16, [%[te], #192]\n\t" + "ldr x16, [%[te], #256]\n\t" + "ldr x16, [%[te], #320]\n\t" + "ldr x16, [%[te], #384]\n\t" + "ldr x16, [%[te], #448]\n\t" + "ldr x16, [%[te], #512]\n\t" + "ldr x16, [%[te], #576]\n\t" + "ldr x16, [%[te], #640]\n\t" + "ldr x16, [%[te], #704]\n\t" + "ldr x16, [%[te], #768]\n\t" + "ldr x16, [%[te], #832]\n\t" + "ldr x16, [%[te], #896]\n\t" + "ldr x16, [%[te], #960]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x11, x15, #16, #8\n\t" + "eor w10, w10, w13, ror 24\n\t" + "ubfx x13, x14, #56, #8\n\t" + "eor w10, w10, w19, ror 8\n\t" + "ubfx x19, x15, #40, #8\n\t" + "eor w10, w10, w20, ror 16\n\t" + "ubfx x20, x14, #0, #8\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, 
[%[te], x20, LSL 2]\n\t" + "ubfx x12, x15, #48, #8\n\t" + "eor w11, w11, w13, ror 24\n\t" + "ubfx x13, x15, #24, #8\n\t" + "eor w11, w11, w19, ror 8\n\t" + "ubfx x19, x14, #8, #8\n\t" + "eor w11, w11, w20, ror 16\n\t" + "ubfx x20, x14, #32, #8\n\t" + "bfi x10, x11, #32, #32\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x16, x15, #0, #8\n\t" + "eor w12, w12, w13, ror 24\n\t" + "ubfx x13, x14, #16, #8\n\t" + "eor w12, w12, w19, ror 8\n\t" + "ubfx x19, x15, #56, #8\n\t" + "eor w11, w12, w20, ror 16\n\t" + "ubfx x20, x14, #40, #8\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w16, ror 24\n\t" + "ldp x14, x15, [x26], #16\n\t" + "eor w13, w13, w19, ror 24\n\t" + "eor w13, w13, w20, ror 8\n\t" + "bfi x11, x13, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x10, x10, x14\n\t" + "eor x11, x11, x15\n\t" + "subs w25, w25, #2\n\t" + "b.ne L_AES_XTS_encrypt_loop_nr_%=\n\t" + "ubfx x14, x10, #48, #8\n\t" + "ubfx x17, x10, #24, #8\n\t" + "ubfx x19, x11, #8, #8\n\t" + "ubfx x20, x11, #32, #8\n\t" + "ldr x12, [%[te]]\n\t" + "ldr x12, [%[te], #64]\n\t" + "ldr x12, [%[te], #128]\n\t" + "ldr x12, [%[te], #192]\n\t" + "ldr x12, [%[te], #256]\n\t" + "ldr x12, [%[te], #320]\n\t" + "ldr x12, [%[te], #384]\n\t" + "ldr x12, [%[te], #448]\n\t" + "ldr x12, [%[te], #512]\n\t" + "ldr x12, [%[te], #576]\n\t" + "ldr x12, [%[te], #640]\n\t" + "ldr x12, [%[te], #704]\n\t" + "ldr x12, [%[te], #768]\n\t" + "ldr x12, [%[te], #832]\n\t" + "ldr x12, [%[te], #896]\n\t" + "ldr x12, [%[te], #960]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x15, x11, #16, #8\n\t" + "eor w14, w14, w17, ror 24\n\t" + "ubfx x17, x10, #56, #8\n\t" + "eor w14, w14, w19, ror 
8\n\t" + "ubfx x19, x11, #40, #8\n\t" + "eor w14, w14, w20, ror 16\n\t" + "ubfx x20, x10, #0, #8\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x16, x11, #48, #8\n\t" + "eor w15, w15, w17, ror 24\n\t" + "ubfx x17, x11, #24, #8\n\t" + "eor w15, w15, w19, ror 8\n\t" + "ubfx x19, x10, #8, #8\n\t" + "eor w15, w15, w20, ror 16\n\t" + "ubfx x20, x10, #32, #8\n\t" + "bfi x14, x15, #32, #32\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x12, x11, #0, #8\n\t" + "eor w16, w16, w17, ror 24\n\t" + "ubfx x17, x10, #16, #8\n\t" + "eor w16, w16, w19, ror 8\n\t" + "ubfx x19, x11, #56, #8\n\t" + "eor w15, w16, w20, ror 16\n\t" + "ubfx x20, x10, #40, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w12, ror 24\n\t" + "ldp x10, x11, [x26], #16\n\t" + "eor w17, w17, w19, ror 24\n\t" + "eor w17, w17, w20, ror 8\n\t" + "bfi x15, x17, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x14, x14, x10\n\t" + "eor x15, x15, x11\n\t" + "ubfx x10, x15, #32, #8\n\t" + "ubfx x13, x15, #8, #8\n\t" + "ubfx x19, x14, #48, #8\n\t" + "ubfx x20, x14, #24, #8\n\t" + "lsl w10, w10, #2\n\t" + "lsl w13, w13, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldr x17, [%[te]]\n\t" + "ldr x17, [%[te], #64]\n\t" + "ldr x17, [%[te], #128]\n\t" + "ldr x17, [%[te], #192]\n\t" + "ldr x17, [%[te], #256]\n\t" + "ldr x17, [%[te], #320]\n\t" + "ldr x17, [%[te], #384]\n\t" + "ldr x17, [%[te], #448]\n\t" + "ldr x17, [%[te], #512]\n\t" + "ldr x17, [%[te], #576]\n\t" + "ldr x17, [%[te], #640]\n\t" + "ldr x17, [%[te], #704]\n\t" + "ldr x17, [%[te], #768]\n\t" + "ldr x17, [%[te], #832]\n\t" + "ldr x17, [%[te], #896]\n\t" + "ldr x17, [%[te], #960]\n\t" + "ldrb w10, 
[%[te], x10, LSL 0]\n\t" + "ldrb w13, [%[te], x13, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x11, x14, #0, #8\n\t" + "eor w10, w10, w13, lsl 8\n\t" + "ubfx x13, x15, #40, #8\n\t" + "eor w10, w10, w19, lsl 16\n\t" + "ubfx x19, x15, #16, #8\n\t" + "eor w10, w10, w20, lsl 24\n\t" + "ubfx x20, x14, #56, #8\n\t" + "lsl w11, w11, #2\n\t" + "lsl w13, w13, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w11, [%[te], x11, LSL 0]\n\t" + "ldrb w13, [%[te], x13, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x12, x14, #32, #8\n\t" + "eor w11, w11, w13, lsl 8\n\t" + "ubfx x13, x14, #8, #8\n\t" + "eor w11, w11, w19, lsl 16\n\t" + "ubfx x19, x15, #48, #8\n\t" + "eor w11, w11, w20, lsl 24\n\t" + "ubfx x20, x15, #24, #8\n\t" + "bfi x10, x11, #32, #32\n\t" + "lsl w12, w12, #2\n\t" + "lsl w13, w13, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w12, [%[te], x12, LSL 0]\n\t" + "ldrb w13, [%[te], x13, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x17, x15, #56, #8\n\t" + "eor w12, w12, w13, lsl 8\n\t" + "ubfx x13, x15, #0, #8\n\t" + "eor w12, w12, w19, lsl 16\n\t" + "ubfx x19, x14, #40, #8\n\t" + "eor w11, w12, w20, lsl 24\n\t" + "ubfx x20, x14, #16, #8\n\t" + "lsl w17, w17, #2\n\t" + "lsl w13, w13, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w17, [%[te], x17, LSL 0]\n\t" + "ldrb w13, [%[te], x13, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "eor w19, w19, w17, lsl 16\n\t" + "ldp x14, x15, [x26]\n\t" + "eor w13, w13, w19, lsl 8\n\t" + "eor w13, w13, w20, lsl 16\n\t" + "bfi x11, x13, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x10, x10, x14\n\t" + "eor x11, x11, x15\n\t" + "rev32 x10, x10\n\t" + "rev32 x11, x11\n\t" + "eor x10, x10, x21\n\t" + "eor x11, x11, x22\n\t" + "stp x10, x11, [%x[out]]\n\t" + "and x19, x9, x22, asr 
63\n\t" + "extr x22, x22, x21, #63\n\t" + "eor x21, x19, x21, lsl 1\n\t" + "sub %w[sz], %w[sz], #16\n\t" + "add %x[in], %x[in], #16\n\t" + "add %x[out], %x[out], #16\n\t" + "cmp %w[sz], #16\n\t" + "b.ge L_AES_XTS_encrypt_loop_block_%=\n\t" + "cbz %w[sz], L_AES_XTS_encrypt_done_data_%=\n\t" + "mov x26, %x[key]\n\t" + "sub %x[out], %x[out], #16\n\t" + "ldp x10, x11, [%x[out]], #16\n\t" + "stp x10, x11, [%x[tmp]]\n\t" + "mov w14, %w[sz]\n\t" + "\n" + "L_AES_XTS_encrypt_start_byte_%=: \n\t" + "ldrb w19, [%x[tmp]]\n\t" + "ldrb w20, [%x[in]], #1\n\t" + "strb w19, [%x[out]], #1\n\t" + "strb w20, [%x[tmp]], #1\n\t" + "subs w14, w14, #1\n\t" + "b.gt L_AES_XTS_encrypt_start_byte_%=\n\t" + "sub %x[out], %x[out], %x[sz]\n\t" + "sub %x[tmp], %x[tmp], %x[sz]\n\t" + "sub %x[out], %x[out], #16\n\t" + "ldp x10, x11, [%x[tmp]]\n\t" + "ldp x14, x15, [x26], #16\n\t" + "eor x10, x10, x21\n\t" + "eor x11, x11, x22\n\t" + "rev32 x10, x10\n\t" + "rev32 x11, x11\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x10, x10, x14\n\t" + "eor x11, x11, x15\n\t" + "sub w25, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_encrypt_loop_nr_partial_%=: \n\t" + "ubfx x14, x10, #48, #8\n\t" + "ubfx x17, x10, #24, #8\n\t" + "ubfx x19, x11, #8, #8\n\t" + "ubfx x20, x11, #32, #8\n\t" + "ldr x12, [%[te]]\n\t" + "ldr x12, [%[te], #64]\n\t" + "ldr x12, [%[te], #128]\n\t" + "ldr x12, [%[te], #192]\n\t" + "ldr x12, [%[te], #256]\n\t" + "ldr x12, [%[te], #320]\n\t" + "ldr x12, [%[te], #384]\n\t" + "ldr x12, [%[te], #448]\n\t" + "ldr x12, [%[te], #512]\n\t" + "ldr x12, [%[te], #576]\n\t" + "ldr x12, [%[te], #640]\n\t" + "ldr x12, [%[te], #704]\n\t" + "ldr x12, [%[te], #768]\n\t" + "ldr x12, [%[te], #832]\n\t" + "ldr x12, [%[te], #896]\n\t" + "ldr x12, [%[te], #960]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x15, x11, #16, #8\n\t" + "eor w14, w14, w17, ror 24\n\t" + "ubfx x17, x10, #56, #8\n\t" + "eor w14, 
w14, w19, ror 8\n\t" + "ubfx x19, x11, #40, #8\n\t" + "eor w14, w14, w20, ror 16\n\t" + "ubfx x20, x10, #0, #8\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x16, x11, #48, #8\n\t" + "eor w15, w15, w17, ror 24\n\t" + "ubfx x17, x11, #24, #8\n\t" + "eor w15, w15, w19, ror 8\n\t" + "ubfx x19, x10, #8, #8\n\t" + "eor w15, w15, w20, ror 16\n\t" + "ubfx x20, x10, #32, #8\n\t" + "bfi x14, x15, #32, #32\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x12, x11, #0, #8\n\t" + "eor w16, w16, w17, ror 24\n\t" + "ubfx x17, x10, #16, #8\n\t" + "eor w16, w16, w19, ror 8\n\t" + "ubfx x19, x11, #56, #8\n\t" + "eor w15, w16, w20, ror 16\n\t" + "ubfx x20, x10, #40, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w12, ror 24\n\t" + "ldp x10, x11, [x26], #16\n\t" + "eor w17, w17, w19, ror 24\n\t" + "eor w17, w17, w20, ror 8\n\t" + "bfi x15, x17, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x14, x14, x10\n\t" + "eor x15, x15, x11\n\t" + "ubfx x10, x14, #48, #8\n\t" + "ubfx x13, x14, #24, #8\n\t" + "ubfx x19, x15, #8, #8\n\t" + "ubfx x20, x15, #32, #8\n\t" + "ldr x16, [%[te]]\n\t" + "ldr x16, [%[te], #64]\n\t" + "ldr x16, [%[te], #128]\n\t" + "ldr x16, [%[te], #192]\n\t" + "ldr x16, [%[te], #256]\n\t" + "ldr x16, [%[te], #320]\n\t" + "ldr x16, [%[te], #384]\n\t" + "ldr x16, [%[te], #448]\n\t" + "ldr x16, [%[te], #512]\n\t" + "ldr x16, [%[te], #576]\n\t" + "ldr x16, [%[te], #640]\n\t" + "ldr x16, [%[te], #704]\n\t" + "ldr x16, [%[te], #768]\n\t" + "ldr x16, [%[te], #832]\n\t" + "ldr x16, [%[te], #896]\n\t" + "ldr x16, [%[te], #960]\n\t" + "ldr w10, [%[te], x10, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 
2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x11, x15, #16, #8\n\t" + "eor w10, w10, w13, ror 24\n\t" + "ubfx x13, x14, #56, #8\n\t" + "eor w10, w10, w19, ror 8\n\t" + "ubfx x19, x15, #40, #8\n\t" + "eor w10, w10, w20, ror 16\n\t" + "ubfx x20, x14, #0, #8\n\t" + "ldr w11, [%[te], x11, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x12, x15, #48, #8\n\t" + "eor w11, w11, w13, ror 24\n\t" + "ubfx x13, x15, #24, #8\n\t" + "eor w11, w11, w19, ror 8\n\t" + "ubfx x19, x14, #8, #8\n\t" + "eor w11, w11, w20, ror 16\n\t" + "ubfx x20, x14, #32, #8\n\t" + "bfi x10, x11, #32, #32\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x16, x15, #0, #8\n\t" + "eor w12, w12, w13, ror 24\n\t" + "ubfx x13, x14, #16, #8\n\t" + "eor w12, w12, w19, ror 8\n\t" + "ubfx x19, x15, #56, #8\n\t" + "eor w11, w12, w20, ror 16\n\t" + "ubfx x20, x14, #40, #8\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w13, [%[te], x13, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w16, ror 24\n\t" + "ldp x14, x15, [x26], #16\n\t" + "eor w13, w13, w19, ror 24\n\t" + "eor w13, w13, w20, ror 8\n\t" + "bfi x11, x13, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x10, x10, x14\n\t" + "eor x11, x11, x15\n\t" + "subs w25, w25, #2\n\t" + "b.ne L_AES_XTS_encrypt_loop_nr_partial_%=\n\t" + "ubfx x14, x10, #48, #8\n\t" + "ubfx x17, x10, #24, #8\n\t" + "ubfx x19, x11, #8, #8\n\t" + "ubfx x20, x11, #32, #8\n\t" + "ldr x12, [%[te]]\n\t" + "ldr x12, [%[te], #64]\n\t" + "ldr x12, [%[te], #128]\n\t" + "ldr x12, [%[te], #192]\n\t" + "ldr x12, [%[te], #256]\n\t" + "ldr x12, [%[te], #320]\n\t" + "ldr x12, [%[te], #384]\n\t" + "ldr x12, [%[te], #448]\n\t" + "ldr x12, [%[te], #512]\n\t" + "ldr x12, [%[te], #576]\n\t" + "ldr x12, [%[te], #640]\n\t" + "ldr x12, [%[te], 
#704]\n\t" + "ldr x12, [%[te], #768]\n\t" + "ldr x12, [%[te], #832]\n\t" + "ldr x12, [%[te], #896]\n\t" + "ldr x12, [%[te], #960]\n\t" + "ldr w14, [%[te], x14, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x15, x11, #16, #8\n\t" + "eor w14, w14, w17, ror 24\n\t" + "ubfx x17, x10, #56, #8\n\t" + "eor w14, w14, w19, ror 8\n\t" + "ubfx x19, x11, #40, #8\n\t" + "eor w14, w14, w20, ror 16\n\t" + "ubfx x20, x10, #0, #8\n\t" + "ldr w15, [%[te], x15, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x16, x11, #48, #8\n\t" + "eor w15, w15, w17, ror 24\n\t" + "ubfx x17, x11, #24, #8\n\t" + "eor w15, w15, w19, ror 8\n\t" + "ubfx x19, x10, #8, #8\n\t" + "eor w15, w15, w20, ror 16\n\t" + "ubfx x20, x10, #32, #8\n\t" + "bfi x14, x15, #32, #32\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ubfx x12, x11, #0, #8\n\t" + "eor w16, w16, w17, ror 24\n\t" + "ubfx x17, x10, #16, #8\n\t" + "eor w16, w16, w19, ror 8\n\t" + "ubfx x19, x11, #56, #8\n\t" + "eor w15, w16, w20, ror 16\n\t" + "ubfx x20, x10, #40, #8\n\t" + "ldr w12, [%[te], x12, LSL 2]\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "eor w19, w19, w12, ror 24\n\t" + "ldp x10, x11, [x26], #16\n\t" + "eor w17, w17, w19, ror 24\n\t" + "eor w17, w17, w20, ror 8\n\t" + "bfi x15, x17, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x14, x14, x10\n\t" + "eor x15, x15, x11\n\t" + "ubfx x10, x15, #32, #8\n\t" + "ubfx x13, x15, #8, #8\n\t" + "ubfx x19, x14, #48, #8\n\t" + "ubfx x20, x14, #24, #8\n\t" + "lsl w10, w10, #2\n\t" + "lsl w13, w13, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldr x17, [%[te]]\n\t" + "ldr x17, [%[te], #64]\n\t" + "ldr x17, [%[te], #128]\n\t" + "ldr x17, 
[%[te], #192]\n\t" + "ldr x17, [%[te], #256]\n\t" + "ldr x17, [%[te], #320]\n\t" + "ldr x17, [%[te], #384]\n\t" + "ldr x17, [%[te], #448]\n\t" + "ldr x17, [%[te], #512]\n\t" + "ldr x17, [%[te], #576]\n\t" + "ldr x17, [%[te], #640]\n\t" + "ldr x17, [%[te], #704]\n\t" + "ldr x17, [%[te], #768]\n\t" + "ldr x17, [%[te], #832]\n\t" + "ldr x17, [%[te], #896]\n\t" + "ldr x17, [%[te], #960]\n\t" + "ldrb w10, [%[te], x10, LSL 0]\n\t" + "ldrb w13, [%[te], x13, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x11, x14, #0, #8\n\t" + "eor w10, w10, w13, lsl 8\n\t" + "ubfx x13, x15, #40, #8\n\t" + "eor w10, w10, w19, lsl 16\n\t" + "ubfx x19, x15, #16, #8\n\t" + "eor w10, w10, w20, lsl 24\n\t" + "ubfx x20, x14, #56, #8\n\t" + "lsl w11, w11, #2\n\t" + "lsl w13, w13, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w11, [%[te], x11, LSL 0]\n\t" + "ldrb w13, [%[te], x13, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x12, x14, #32, #8\n\t" + "eor w11, w11, w13, lsl 8\n\t" + "ubfx x13, x14, #8, #8\n\t" + "eor w11, w11, w19, lsl 16\n\t" + "ubfx x19, x15, #48, #8\n\t" + "eor w11, w11, w20, lsl 24\n\t" + "ubfx x20, x15, #24, #8\n\t" + "bfi x10, x11, #32, #32\n\t" + "lsl w12, w12, #2\n\t" + "lsl w13, w13, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w12, [%[te], x12, LSL 0]\n\t" + "ldrb w13, [%[te], x13, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ubfx x17, x15, #56, #8\n\t" + "eor w12, w12, w13, lsl 8\n\t" + "ubfx x13, x15, #0, #8\n\t" + "eor w12, w12, w19, lsl 16\n\t" + "ubfx x19, x14, #40, #8\n\t" + "eor w11, w12, w20, lsl 24\n\t" + "ubfx x20, x14, #16, #8\n\t" + "lsl w17, w17, #2\n\t" + "lsl w13, w13, #2\n\t" + "lsl w19, w19, #2\n\t" + "lsl w20, w20, #2\n\t" + "ldrb w17, [%[te], x17, LSL 0]\n\t" + "ldrb w13, [%[te], x13, LSL 0]\n\t" + "ldrb w19, [%[te], x19, LSL 0]\n\t" + "ldrb w20, [%[te], x20, LSL 
0]\n\t" + "eor w19, w19, w17, lsl 16\n\t" + "ldp x14, x15, [x26]\n\t" + "eor w13, w13, w19, lsl 8\n\t" + "eor w13, w13, w20, lsl 16\n\t" + "bfi x11, x13, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x10, x10, x14\n\t" + "eor x11, x11, x15\n\t" + "rev32 x10, x10\n\t" + "rev32 x11, x11\n\t" + "eor x10, x10, x21\n\t" + "eor x11, x11, x22\n\t" + "stp x10, x11, [%x[out]]\n\t" + "\n" + "L_AES_XTS_encrypt_done_data_%=: \n\t" + "ldp x29, x30, [sp], #32\n\t" + : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), + [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr) + : [in] "r" (in), [i] "r" (i), [te] "r" (te) + : "memory", "cc", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", + "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26" + ); +} + +#ifdef HAVE_AES_DECRYPT +void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, const byte* i, + byte* key, byte* key2, byte* tmp, int nr) +{ + const word32* td = L_AES_ARM64_td; + const word8* td4 = L_AES_ARM64_td4; + const word32* te = L_AES_ARM64_te; + __asm__ __volatile__ ( + "stp x29, x30, [sp, #-32]!\n\t" + "add x29, sp, #0\n\t" + "ands w11, %w[sz], #15\n\t" + "cset w11, ne\n\t" + "lsl w11, w11, #4\n\t" + "sub %w[sz], %w[sz], w11\n\t" + "mov x11, #0x87\n\t" + "mov x28, %x[key2]\n\t" + "ldp x23, x24, [%x[i]]\n\t" + "ldp x16, x17, [x28], #16\n\t" + "rev32 x23, x23\n\t" + "rev32 x24, x24\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x23, x23, x16\n\t" + "eor x24, x24, x17\n\t" + "sub w27, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_loop_nr_tweak_%=: \n\t" + "ubfx x16, x23, #48, #8\n\t" + "ubfx x20, x23, #24, #8\n\t" + "ubfx x21, x24, #8, #8\n\t" + "ubfx x22, x24, #32, #8\n\t" + "ldr x25, [%[te]]\n\t" + "ldr x25, [%[te], #64]\n\t" + "ldr x25, [%[te], #128]\n\t" + "ldr x25, [%[te], #192]\n\t" + "ldr x25, [%[te], #256]\n\t" + "ldr x25, [%[te], #320]\n\t" + "ldr x25, [%[te], #384]\n\t" + "ldr x25, [%[te], #448]\n\t" + "ldr x25, [%[te], #512]\n\t" + "ldr x25, [%[te], #576]\n\t" + "ldr x25, [%[te], #640]\n\t" 
+ "ldr x25, [%[te], #704]\n\t" + "ldr x25, [%[te], #768]\n\t" + "ldr x25, [%[te], #832]\n\t" + "ldr x25, [%[te], #896]\n\t" + "ldr x25, [%[te], #960]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "ubfx x17, x24, #16, #8\n\t" + "eor w16, w16, w20, ror 24\n\t" + "ubfx x20, x23, #56, #8\n\t" + "eor w16, w16, w21, ror 8\n\t" + "ubfx x21, x24, #40, #8\n\t" + "eor w16, w16, w22, ror 16\n\t" + "ubfx x22, x23, #0, #8\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "ubfx x19, x24, #48, #8\n\t" + "eor w17, w17, w20, ror 24\n\t" + "ubfx x20, x24, #24, #8\n\t" + "eor w17, w17, w21, ror 8\n\t" + "ubfx x21, x23, #8, #8\n\t" + "eor w17, w17, w22, ror 16\n\t" + "ubfx x22, x23, #32, #8\n\t" + "bfi x16, x17, #32, #32\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "ubfx x25, x24, #0, #8\n\t" + "eor w19, w19, w20, ror 24\n\t" + "ubfx x20, x23, #16, #8\n\t" + "eor w19, w19, w21, ror 8\n\t" + "ubfx x21, x24, #56, #8\n\t" + "eor w17, w19, w22, ror 16\n\t" + "ubfx x22, x23, #40, #8\n\t" + "ldr w25, [%[te], x25, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "eor w21, w21, w25, ror 24\n\t" + "ldp x23, x24, [x28], #16\n\t" + "eor w20, w20, w21, ror 24\n\t" + "eor w20, w20, w22, ror 8\n\t" + "bfi x17, x20, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x16, x16, x23\n\t" + "eor x17, x17, x24\n\t" + "ubfx x23, x16, #48, #8\n\t" + "ubfx x26, x16, #24, #8\n\t" + "ubfx x21, x17, #8, #8\n\t" + "ubfx x22, x17, #32, #8\n\t" + "ldr x19, [%[te]]\n\t" + "ldr x19, [%[te], #64]\n\t" + "ldr x19, [%[te], #128]\n\t" + "ldr x19, [%[te], #192]\n\t" + "ldr x19, [%[te], #256]\n\t" + "ldr x19, [%[te], #320]\n\t" + 
"ldr x19, [%[te], #384]\n\t" + "ldr x19, [%[te], #448]\n\t" + "ldr x19, [%[te], #512]\n\t" + "ldr x19, [%[te], #576]\n\t" + "ldr x19, [%[te], #640]\n\t" + "ldr x19, [%[te], #704]\n\t" + "ldr x19, [%[te], #768]\n\t" + "ldr x19, [%[te], #832]\n\t" + "ldr x19, [%[te], #896]\n\t" + "ldr x19, [%[te], #960]\n\t" + "ldr w23, [%[te], x23, LSL 2]\n\t" + "ldr w26, [%[te], x26, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "ubfx x24, x17, #16, #8\n\t" + "eor w23, w23, w26, ror 24\n\t" + "ubfx x26, x16, #56, #8\n\t" + "eor w23, w23, w21, ror 8\n\t" + "ubfx x21, x17, #40, #8\n\t" + "eor w23, w23, w22, ror 16\n\t" + "ubfx x22, x16, #0, #8\n\t" + "ldr w24, [%[te], x24, LSL 2]\n\t" + "ldr w26, [%[te], x26, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "ubfx x25, x17, #48, #8\n\t" + "eor w24, w24, w26, ror 24\n\t" + "ubfx x26, x17, #24, #8\n\t" + "eor w24, w24, w21, ror 8\n\t" + "ubfx x21, x16, #8, #8\n\t" + "eor w24, w24, w22, ror 16\n\t" + "ubfx x22, x16, #32, #8\n\t" + "bfi x23, x24, #32, #32\n\t" + "ldr w25, [%[te], x25, LSL 2]\n\t" + "ldr w26, [%[te], x26, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "ubfx x19, x17, #0, #8\n\t" + "eor w25, w25, w26, ror 24\n\t" + "ubfx x26, x16, #16, #8\n\t" + "eor w25, w25, w21, ror 8\n\t" + "ubfx x21, x17, #56, #8\n\t" + "eor w24, w25, w22, ror 16\n\t" + "ubfx x22, x16, #40, #8\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w26, [%[te], x26, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "eor w21, w21, w19, ror 24\n\t" + "ldp x16, x17, [x28], #16\n\t" + "eor w26, w26, w21, ror 24\n\t" + "eor w26, w26, w22, ror 8\n\t" + "bfi x24, x26, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x23, x23, x16\n\t" + "eor x24, x24, x17\n\t" + "subs w27, w27, #2\n\t" + "b.ne L_AES_XTS_decrypt_loop_nr_tweak_%=\n\t" + "ubfx x16, x23, #48, #8\n\t" + "ubfx x20, x23, #24, #8\n\t" + "ubfx x21, 
x24, #8, #8\n\t" + "ubfx x22, x24, #32, #8\n\t" + "ldr x25, [%[te]]\n\t" + "ldr x25, [%[te], #64]\n\t" + "ldr x25, [%[te], #128]\n\t" + "ldr x25, [%[te], #192]\n\t" + "ldr x25, [%[te], #256]\n\t" + "ldr x25, [%[te], #320]\n\t" + "ldr x25, [%[te], #384]\n\t" + "ldr x25, [%[te], #448]\n\t" + "ldr x25, [%[te], #512]\n\t" + "ldr x25, [%[te], #576]\n\t" + "ldr x25, [%[te], #640]\n\t" + "ldr x25, [%[te], #704]\n\t" + "ldr x25, [%[te], #768]\n\t" + "ldr x25, [%[te], #832]\n\t" + "ldr x25, [%[te], #896]\n\t" + "ldr x25, [%[te], #960]\n\t" + "ldr w16, [%[te], x16, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "ubfx x17, x24, #16, #8\n\t" + "eor w16, w16, w20, ror 24\n\t" + "ubfx x20, x23, #56, #8\n\t" + "eor w16, w16, w21, ror 8\n\t" + "ubfx x21, x24, #40, #8\n\t" + "eor w16, w16, w22, ror 16\n\t" + "ubfx x22, x23, #0, #8\n\t" + "ldr w17, [%[te], x17, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "ubfx x19, x24, #48, #8\n\t" + "eor w17, w17, w20, ror 24\n\t" + "ubfx x20, x24, #24, #8\n\t" + "eor w17, w17, w21, ror 8\n\t" + "ubfx x21, x23, #8, #8\n\t" + "eor w17, w17, w22, ror 16\n\t" + "ubfx x22, x23, #32, #8\n\t" + "bfi x16, x17, #32, #32\n\t" + "ldr w19, [%[te], x19, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "ubfx x25, x24, #0, #8\n\t" + "eor w19, w19, w20, ror 24\n\t" + "ubfx x20, x23, #16, #8\n\t" + "eor w19, w19, w21, ror 8\n\t" + "ubfx x21, x24, #56, #8\n\t" + "eor w17, w19, w22, ror 16\n\t" + "ubfx x22, x23, #40, #8\n\t" + "ldr w25, [%[te], x25, LSL 2]\n\t" + "ldr w21, [%[te], x21, LSL 2]\n\t" + "ldr w20, [%[te], x20, LSL 2]\n\t" + "ldr w22, [%[te], x22, LSL 2]\n\t" + "eor w21, w21, w25, ror 24\n\t" + "ldp x23, x24, [x28], #16\n\t" + "eor w20, w20, w21, ror 24\n\t" + "eor w20, w20, w22, ror 8\n\t" + "bfi x17, x20, #32, #32\n\t" + 
/* XOR in Key Schedule */ + "eor x16, x16, x23\n\t" + "eor x17, x17, x24\n\t" + "ubfx x23, x17, #32, #8\n\t" + "ubfx x26, x17, #8, #8\n\t" + "ubfx x21, x16, #48, #8\n\t" + "ubfx x22, x16, #24, #8\n\t" + "lsl w23, w23, #2\n\t" + "lsl w26, w26, #2\n\t" + "lsl w21, w21, #2\n\t" + "lsl w22, w22, #2\n\t" + "ldr x20, [%[te]]\n\t" + "ldr x20, [%[te], #64]\n\t" + "ldr x20, [%[te], #128]\n\t" + "ldr x20, [%[te], #192]\n\t" + "ldr x20, [%[te], #256]\n\t" + "ldr x20, [%[te], #320]\n\t" + "ldr x20, [%[te], #384]\n\t" + "ldr x20, [%[te], #448]\n\t" + "ldr x20, [%[te], #512]\n\t" + "ldr x20, [%[te], #576]\n\t" + "ldr x20, [%[te], #640]\n\t" + "ldr x20, [%[te], #704]\n\t" + "ldr x20, [%[te], #768]\n\t" + "ldr x20, [%[te], #832]\n\t" + "ldr x20, [%[te], #896]\n\t" + "ldr x20, [%[te], #960]\n\t" + "ldrb w23, [%[te], x23, LSL 0]\n\t" + "ldrb w26, [%[te], x26, LSL 0]\n\t" + "ldrb w21, [%[te], x21, LSL 0]\n\t" + "ldrb w22, [%[te], x22, LSL 0]\n\t" + "ubfx x24, x16, #0, #8\n\t" + "eor w23, w23, w26, lsl 8\n\t" + "ubfx x26, x17, #40, #8\n\t" + "eor w23, w23, w21, lsl 16\n\t" + "ubfx x21, x17, #16, #8\n\t" + "eor w23, w23, w22, lsl 24\n\t" + "ubfx x22, x16, #56, #8\n\t" + "lsl w24, w24, #2\n\t" + "lsl w26, w26, #2\n\t" + "lsl w21, w21, #2\n\t" + "lsl w22, w22, #2\n\t" + "ldrb w24, [%[te], x24, LSL 0]\n\t" + "ldrb w26, [%[te], x26, LSL 0]\n\t" + "ldrb w21, [%[te], x21, LSL 0]\n\t" + "ldrb w22, [%[te], x22, LSL 0]\n\t" + "ubfx x25, x16, #32, #8\n\t" + "eor w24, w24, w26, lsl 8\n\t" + "ubfx x26, x16, #8, #8\n\t" + "eor w24, w24, w21, lsl 16\n\t" + "ubfx x21, x17, #48, #8\n\t" + "eor w24, w24, w22, lsl 24\n\t" + "ubfx x22, x17, #24, #8\n\t" + "bfi x23, x24, #32, #32\n\t" + "lsl w25, w25, #2\n\t" + "lsl w26, w26, #2\n\t" + "lsl w21, w21, #2\n\t" + "lsl w22, w22, #2\n\t" + "ldrb w25, [%[te], x25, LSL 0]\n\t" + "ldrb w26, [%[te], x26, LSL 0]\n\t" + "ldrb w21, [%[te], x21, LSL 0]\n\t" + "ldrb w22, [%[te], x22, LSL 0]\n\t" + "ubfx x20, x17, #56, #8\n\t" + "eor w25, w25, w26, lsl 8\n\t" + "ubfx 
x26, x17, #0, #8\n\t" + "eor w25, w25, w21, lsl 16\n\t" + "ubfx x21, x16, #40, #8\n\t" + "eor w24, w25, w22, lsl 24\n\t" + "ubfx x22, x16, #16, #8\n\t" + "lsl w20, w20, #2\n\t" + "lsl w26, w26, #2\n\t" + "lsl w21, w21, #2\n\t" + "lsl w22, w22, #2\n\t" + "ldrb w20, [%[te], x20, LSL 0]\n\t" + "ldrb w26, [%[te], x26, LSL 0]\n\t" + "ldrb w21, [%[te], x21, LSL 0]\n\t" + "ldrb w22, [%[te], x22, LSL 0]\n\t" + "eor w21, w21, w20, lsl 16\n\t" + "ldp x16, x17, [x28]\n\t" + "eor w26, w26, w21, lsl 8\n\t" + "eor w26, w26, w22, lsl 16\n\t" + "bfi x24, x26, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x23, x23, x16\n\t" + "eor x24, x24, x17\n\t" + "rev32 x23, x23\n\t" + "rev32 x24, x24\n\t" + "cmp %w[sz], #16\n\t" + "b.lt L_AES_XTS_decrypt_start_partail_%=\n\t" + "\n" + "L_AES_XTS_decrypt_loop_block_%=: \n\t" + "mov x28, %x[key]\n\t" + "ldp x12, x13, [%x[in]]\n\t" + "ldp x16, x17, [x28], #16\n\t" + "eor x12, x12, x23\n\t" + "eor x13, x13, x24\n\t" + "rev32 x12, x12\n\t" + "rev32 x13, x13\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x12, x12, x16\n\t" + "eor x13, x13, x17\n\t" + "sub w27, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_loop_nr_%=: \n\t" + "ubfx x16, x13, #48, #8\n\t" + "ubfx x20, x12, #24, #8\n\t" + "ubfx x21, x13, #8, #8\n\t" + "ubfx x22, x12, #32, #8\n\t" + "ldr x14, [%[td]]\n\t" + "ldr x14, [%[td], #64]\n\t" + "ldr x14, [%[td], #128]\n\t" + "ldr x14, [%[td], #192]\n\t" + "ldr x14, [%[td], #256]\n\t" + "ldr x14, [%[td], #320]\n\t" + "ldr x14, [%[td], #384]\n\t" + "ldr x14, [%[td], #448]\n\t" + "ldr x14, [%[td], #512]\n\t" + "ldr x14, [%[td], #576]\n\t" + "ldr x14, [%[td], #640]\n\t" + "ldr x14, [%[td], #704]\n\t" + "ldr x14, [%[td], #768]\n\t" + "ldr x14, [%[td], #832]\n\t" + "ldr x14, [%[td], #896]\n\t" + "ldr x14, [%[td], #960]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x17, x12, #16, #8\n\t" + "eor w16, w16, w20, ror 24\n\t" + "ubfx 
x20, x12, #56, #8\n\t" + "eor w16, w16, w21, ror 8\n\t" + "ubfx x21, x13, #40, #8\n\t" + "eor w16, w16, w22, ror 16\n\t" + "ubfx x22, x13, #0, #8\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x19, x12, #48, #8\n\t" + "eor w17, w17, w20, ror 24\n\t" + "ubfx x20, x13, #24, #8\n\t" + "eor w17, w17, w21, ror 8\n\t" + "ubfx x21, x12, #8, #8\n\t" + "eor w17, w17, w22, ror 16\n\t" + "ubfx x22, x13, #32, #8\n\t" + "bfi x16, x17, #32, #32\n\t" + "ldr w19, [%[td], x19, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x14, x12, #0, #8\n\t" + "eor w19, w19, w20, ror 24\n\t" + "ubfx x20, x13, #16, #8\n\t" + "eor w19, w19, w21, ror 8\n\t" + "ubfx x21, x13, #56, #8\n\t" + "eor w17, w19, w22, ror 16\n\t" + "ubfx x22, x12, #40, #8\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "eor w21, w21, w14, ror 24\n\t" + "ldp x12, x13, [x28], #16\n\t" + "eor w20, w20, w22, ror 8\n\t" + "eor w20, w20, w21, ror 24\n\t" + "bfi x17, x20, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x16, x16, x12\n\t" + "eor x17, x17, x13\n\t" + "ubfx x12, x17, #48, #8\n\t" + "ubfx x15, x16, #24, #8\n\t" + "ubfx x21, x17, #8, #8\n\t" + "ubfx x22, x16, #32, #8\n\t" + "ldr x19, [%[td]]\n\t" + "ldr x19, [%[td], #64]\n\t" + "ldr x19, [%[td], #128]\n\t" + "ldr x19, [%[td], #192]\n\t" + "ldr x19, [%[td], #256]\n\t" + "ldr x19, [%[td], #320]\n\t" + "ldr x19, [%[td], #384]\n\t" + "ldr x19, [%[td], #448]\n\t" + "ldr x19, [%[td], #512]\n\t" + "ldr x19, [%[td], #576]\n\t" + "ldr x19, [%[td], #640]\n\t" + "ldr x19, [%[td], #704]\n\t" + "ldr x19, [%[td], #768]\n\t" + "ldr x19, [%[td], #832]\n\t" + "ldr x19, [%[td], #896]\n\t" + "ldr x19, [%[td], #960]\n\t" + "ldr w12, [%[td], x12, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 
2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x13, x16, #16, #8\n\t" + "eor w12, w12, w15, ror 24\n\t" + "ubfx x15, x16, #56, #8\n\t" + "eor w12, w12, w21, ror 8\n\t" + "ubfx x21, x17, #40, #8\n\t" + "eor w12, w12, w22, ror 16\n\t" + "ubfx x22, x17, #0, #8\n\t" + "ldr w13, [%[td], x13, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x14, x16, #48, #8\n\t" + "eor w13, w13, w15, ror 24\n\t" + "ubfx x15, x17, #24, #8\n\t" + "eor w13, w13, w21, ror 8\n\t" + "ubfx x21, x16, #8, #8\n\t" + "eor w13, w13, w22, ror 16\n\t" + "ubfx x22, x17, #32, #8\n\t" + "bfi x12, x13, #32, #32\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x19, x16, #0, #8\n\t" + "eor w14, w14, w15, ror 24\n\t" + "ubfx x15, x17, #16, #8\n\t" + "eor w14, w14, w21, ror 8\n\t" + "ubfx x21, x17, #56, #8\n\t" + "eor w13, w14, w22, ror 16\n\t" + "ubfx x22, x16, #40, #8\n\t" + "ldr w19, [%[td], x19, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "eor w21, w21, w19, ror 24\n\t" + "ldp x16, x17, [x28], #16\n\t" + "eor w15, w15, w22, ror 8\n\t" + "eor w15, w15, w21, ror 24\n\t" + "bfi x13, x15, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x12, x12, x16\n\t" + "eor x13, x13, x17\n\t" + "subs w27, w27, #2\n\t" + "b.ne L_AES_XTS_decrypt_loop_nr_%=\n\t" + "ubfx x16, x13, #48, #8\n\t" + "ubfx x20, x12, #24, #8\n\t" + "ubfx x21, x13, #8, #8\n\t" + "ubfx x22, x12, #32, #8\n\t" + "ldr x14, [%[td]]\n\t" + "ldr x14, [%[td], #64]\n\t" + "ldr x14, [%[td], #128]\n\t" + "ldr x14, [%[td], #192]\n\t" + "ldr x14, [%[td], #256]\n\t" + "ldr x14, [%[td], #320]\n\t" + "ldr x14, [%[td], #384]\n\t" + "ldr x14, [%[td], #448]\n\t" + "ldr x14, [%[td], #512]\n\t" + "ldr x14, [%[td], #576]\n\t" + "ldr x14, [%[td], #640]\n\t" 
+ "ldr x14, [%[td], #704]\n\t" + "ldr x14, [%[td], #768]\n\t" + "ldr x14, [%[td], #832]\n\t" + "ldr x14, [%[td], #896]\n\t" + "ldr x14, [%[td], #960]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x17, x12, #16, #8\n\t" + "eor w16, w16, w20, ror 24\n\t" + "ubfx x20, x12, #56, #8\n\t" + "eor w16, w16, w21, ror 8\n\t" + "ubfx x21, x13, #40, #8\n\t" + "eor w16, w16, w22, ror 16\n\t" + "ubfx x22, x13, #0, #8\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x19, x12, #48, #8\n\t" + "eor w17, w17, w20, ror 24\n\t" + "ubfx x20, x13, #24, #8\n\t" + "eor w17, w17, w21, ror 8\n\t" + "ubfx x21, x12, #8, #8\n\t" + "eor w17, w17, w22, ror 16\n\t" + "ubfx x22, x13, #32, #8\n\t" + "bfi x16, x17, #32, #32\n\t" + "ldr w19, [%[td], x19, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x14, x12, #0, #8\n\t" + "eor w19, w19, w20, ror 24\n\t" + "ubfx x20, x13, #16, #8\n\t" + "eor w19, w19, w21, ror 8\n\t" + "ubfx x21, x13, #56, #8\n\t" + "eor w17, w19, w22, ror 16\n\t" + "ubfx x22, x12, #40, #8\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "eor w21, w21, w14, ror 24\n\t" + "ldp x12, x13, [x28], #16\n\t" + "eor w20, w20, w22, ror 8\n\t" + "eor w20, w20, w21, ror 24\n\t" + "bfi x17, x20, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x16, x16, x12\n\t" + "eor x17, x17, x13\n\t" + "ubfx x12, x16, #32, #8\n\t" + "ubfx x15, x17, #8, #8\n\t" + "ubfx x21, x17, #48, #8\n\t" + "ubfx x22, x16, #24, #8\n\t" + "ldr x20, [%[td4]]\n\t" + "ldr x20, [%[td4], #64]\n\t" + "ldr x20, [%[td4], #128]\n\t" + "ldr x20, [%[td4], #192]\n\t" + "ldr x20, [%[td4], #256]\n\t" + "ldr x20, [%[td4], 
#320]\n\t" + "ldr x20, [%[td4], #384]\n\t" + "ldr x20, [%[td4], #448]\n\t" + "ldr x20, [%[td4], #512]\n\t" + "ldr x20, [%[td4], #576]\n\t" + "ldr x20, [%[td4], #640]\n\t" + "ldr x20, [%[td4], #704]\n\t" + "ldr x20, [%[td4], #768]\n\t" + "ldr x20, [%[td4], #832]\n\t" + "ldr x20, [%[td4], #896]\n\t" + "ldr x20, [%[td4], #960]\n\t" + "ldrb w12, [%[td4], x12, LSL 0]\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "ubfx x13, x17, #0, #8\n\t" + "eor w12, w12, w15, lsl 8\n\t" + "ubfx x15, x17, #40, #8\n\t" + "eor w12, w12, w21, lsl 16\n\t" + "ubfx x21, x16, #16, #8\n\t" + "eor w12, w12, w22, lsl 24\n\t" + "ubfx x22, x16, #56, #8\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "ldrb w13, [%[td4], x13, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ubfx x14, x17, #32, #8\n\t" + "eor w13, w13, w15, lsl 8\n\t" + "ubfx x15, x16, #8, #8\n\t" + "eor w13, w13, w21, lsl 16\n\t" + "ubfx x21, x16, #48, #8\n\t" + "eor w13, w13, w22, lsl 24\n\t" + "ubfx x22, x17, #24, #8\n\t" + "bfi x12, x13, #32, #32\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "ldrb w14, [%[td4], x14, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ubfx x20, x17, #56, #8\n\t" + "eor w14, w14, w15, lsl 8\n\t" + "ubfx x15, x16, #0, #8\n\t" + "eor w14, w14, w21, lsl 16\n\t" + "ubfx x21, x16, #40, #8\n\t" + "eor w13, w14, w22, lsl 24\n\t" + "ubfx x22, x17, #16, #8\n\t" + "ldrb w20, [%[td4], x20, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "eor w21, w21, w20, lsl 16\n\t" + "ldp x16, x17, [x28]\n\t" + "eor w15, w15, w21, lsl 8\n\t" + "eor w15, w15, w22, lsl 16\n\t" + "bfi x13, x15, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x12, x12, x16\n\t" + "eor x13, x13, x17\n\t" + "rev32 x12, x12\n\t" + "rev32 x13, x13\n\t" + "eor x12, x12, x23\n\t" + "eor x13, x13, x24\n\t" + 
"stp x12, x13, [%x[out]]\n\t" + "and x21, x11, x24, asr 63\n\t" + "extr x24, x24, x23, #63\n\t" + "eor x23, x21, x23, lsl 1\n\t" + "sub %w[sz], %w[sz], #16\n\t" + "add %x[in], %x[in], #16\n\t" + "add %x[out], %x[out], #16\n\t" + "cmp %w[sz], #16\n\t" + "b.ge L_AES_XTS_decrypt_loop_block_%=\n\t" + "cbz %w[sz], L_AES_XTS_decrypt_done_data_%=\n\t" + "\n" + "L_AES_XTS_decrypt_start_partail_%=: \n\t" + "and x21, x11, x24, asr 63\n\t" + "extr x26, x24, x23, #63\n\t" + "eor x25, x21, x23, lsl 1\n\t" + "mov x28, %x[key]\n\t" + "ldp x12, x13, [%x[in]], #16\n\t" + "ldp x16, x17, [x28], #16\n\t" + "eor x12, x12, x25\n\t" + "eor x13, x13, x26\n\t" + "rev32 x12, x12\n\t" + "rev32 x13, x13\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x12, x12, x16\n\t" + "eor x13, x13, x17\n\t" + "sub w27, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_loop_nr_partial_1_%=: \n\t" + "ubfx x16, x13, #48, #8\n\t" + "ubfx x20, x12, #24, #8\n\t" + "ubfx x21, x13, #8, #8\n\t" + "ubfx x22, x12, #32, #8\n\t" + "ldr x14, [%[td]]\n\t" + "ldr x14, [%[td], #64]\n\t" + "ldr x14, [%[td], #128]\n\t" + "ldr x14, [%[td], #192]\n\t" + "ldr x14, [%[td], #256]\n\t" + "ldr x14, [%[td], #320]\n\t" + "ldr x14, [%[td], #384]\n\t" + "ldr x14, [%[td], #448]\n\t" + "ldr x14, [%[td], #512]\n\t" + "ldr x14, [%[td], #576]\n\t" + "ldr x14, [%[td], #640]\n\t" + "ldr x14, [%[td], #704]\n\t" + "ldr x14, [%[td], #768]\n\t" + "ldr x14, [%[td], #832]\n\t" + "ldr x14, [%[td], #896]\n\t" + "ldr x14, [%[td], #960]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x17, x12, #16, #8\n\t" + "eor w16, w16, w20, ror 24\n\t" + "ubfx x20, x12, #56, #8\n\t" + "eor w16, w16, w21, ror 8\n\t" + "ubfx x21, x13, #40, #8\n\t" + "eor w16, w16, w22, ror 16\n\t" + "ubfx x22, x13, #0, #8\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + 
"ubfx x19, x12, #48, #8\n\t" + "eor w17, w17, w20, ror 24\n\t" + "ubfx x20, x13, #24, #8\n\t" + "eor w17, w17, w21, ror 8\n\t" + "ubfx x21, x12, #8, #8\n\t" + "eor w17, w17, w22, ror 16\n\t" + "ubfx x22, x13, #32, #8\n\t" + "bfi x16, x17, #32, #32\n\t" + "ldr w19, [%[td], x19, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x14, x12, #0, #8\n\t" + "eor w19, w19, w20, ror 24\n\t" + "ubfx x20, x13, #16, #8\n\t" + "eor w19, w19, w21, ror 8\n\t" + "ubfx x21, x13, #56, #8\n\t" + "eor w17, w19, w22, ror 16\n\t" + "ubfx x22, x12, #40, #8\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "eor w21, w21, w14, ror 24\n\t" + "ldp x12, x13, [x28], #16\n\t" + "eor w20, w20, w22, ror 8\n\t" + "eor w20, w20, w21, ror 24\n\t" + "bfi x17, x20, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x16, x16, x12\n\t" + "eor x17, x17, x13\n\t" + "ubfx x12, x17, #48, #8\n\t" + "ubfx x15, x16, #24, #8\n\t" + "ubfx x21, x17, #8, #8\n\t" + "ubfx x22, x16, #32, #8\n\t" + "ldr x19, [%[td]]\n\t" + "ldr x19, [%[td], #64]\n\t" + "ldr x19, [%[td], #128]\n\t" + "ldr x19, [%[td], #192]\n\t" + "ldr x19, [%[td], #256]\n\t" + "ldr x19, [%[td], #320]\n\t" + "ldr x19, [%[td], #384]\n\t" + "ldr x19, [%[td], #448]\n\t" + "ldr x19, [%[td], #512]\n\t" + "ldr x19, [%[td], #576]\n\t" + "ldr x19, [%[td], #640]\n\t" + "ldr x19, [%[td], #704]\n\t" + "ldr x19, [%[td], #768]\n\t" + "ldr x19, [%[td], #832]\n\t" + "ldr x19, [%[td], #896]\n\t" + "ldr x19, [%[td], #960]\n\t" + "ldr w12, [%[td], x12, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x13, x16, #16, #8\n\t" + "eor w12, w12, w15, ror 24\n\t" + "ubfx x15, x16, #56, #8\n\t" + "eor w12, w12, w21, ror 8\n\t" + "ubfx x21, x17, #40, #8\n\t" + "eor w12, w12, w22, ror 16\n\t" + "ubfx x22, x17, #0, 
#8\n\t" + "ldr w13, [%[td], x13, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x14, x16, #48, #8\n\t" + "eor w13, w13, w15, ror 24\n\t" + "ubfx x15, x17, #24, #8\n\t" + "eor w13, w13, w21, ror 8\n\t" + "ubfx x21, x16, #8, #8\n\t" + "eor w13, w13, w22, ror 16\n\t" + "ubfx x22, x17, #32, #8\n\t" + "bfi x12, x13, #32, #32\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x19, x16, #0, #8\n\t" + "eor w14, w14, w15, ror 24\n\t" + "ubfx x15, x17, #16, #8\n\t" + "eor w14, w14, w21, ror 8\n\t" + "ubfx x21, x17, #56, #8\n\t" + "eor w13, w14, w22, ror 16\n\t" + "ubfx x22, x16, #40, #8\n\t" + "ldr w19, [%[td], x19, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "eor w21, w21, w19, ror 24\n\t" + "ldp x16, x17, [x28], #16\n\t" + "eor w15, w15, w22, ror 8\n\t" + "eor w15, w15, w21, ror 24\n\t" + "bfi x13, x15, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x12, x12, x16\n\t" + "eor x13, x13, x17\n\t" + "subs w27, w27, #2\n\t" + "b.ne L_AES_XTS_decrypt_loop_nr_partial_1_%=\n\t" + "ubfx x16, x13, #48, #8\n\t" + "ubfx x20, x12, #24, #8\n\t" + "ubfx x21, x13, #8, #8\n\t" + "ubfx x22, x12, #32, #8\n\t" + "ldr x14, [%[td]]\n\t" + "ldr x14, [%[td], #64]\n\t" + "ldr x14, [%[td], #128]\n\t" + "ldr x14, [%[td], #192]\n\t" + "ldr x14, [%[td], #256]\n\t" + "ldr x14, [%[td], #320]\n\t" + "ldr x14, [%[td], #384]\n\t" + "ldr x14, [%[td], #448]\n\t" + "ldr x14, [%[td], #512]\n\t" + "ldr x14, [%[td], #576]\n\t" + "ldr x14, [%[td], #640]\n\t" + "ldr x14, [%[td], #704]\n\t" + "ldr x14, [%[td], #768]\n\t" + "ldr x14, [%[td], #832]\n\t" + "ldr x14, [%[td], #896]\n\t" + "ldr x14, [%[td], #960]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], 
x22, LSL 2]\n\t" + "ubfx x17, x12, #16, #8\n\t" + "eor w16, w16, w20, ror 24\n\t" + "ubfx x20, x12, #56, #8\n\t" + "eor w16, w16, w21, ror 8\n\t" + "ubfx x21, x13, #40, #8\n\t" + "eor w16, w16, w22, ror 16\n\t" + "ubfx x22, x13, #0, #8\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x19, x12, #48, #8\n\t" + "eor w17, w17, w20, ror 24\n\t" + "ubfx x20, x13, #24, #8\n\t" + "eor w17, w17, w21, ror 8\n\t" + "ubfx x21, x12, #8, #8\n\t" + "eor w17, w17, w22, ror 16\n\t" + "ubfx x22, x13, #32, #8\n\t" + "bfi x16, x17, #32, #32\n\t" + "ldr w19, [%[td], x19, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x14, x12, #0, #8\n\t" + "eor w19, w19, w20, ror 24\n\t" + "ubfx x20, x13, #16, #8\n\t" + "eor w19, w19, w21, ror 8\n\t" + "ubfx x21, x13, #56, #8\n\t" + "eor w17, w19, w22, ror 16\n\t" + "ubfx x22, x12, #40, #8\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "eor w21, w21, w14, ror 24\n\t" + "ldp x12, x13, [x28], #16\n\t" + "eor w20, w20, w22, ror 8\n\t" + "eor w20, w20, w21, ror 24\n\t" + "bfi x17, x20, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x16, x16, x12\n\t" + "eor x17, x17, x13\n\t" + "ubfx x12, x16, #32, #8\n\t" + "ubfx x15, x17, #8, #8\n\t" + "ubfx x21, x17, #48, #8\n\t" + "ubfx x22, x16, #24, #8\n\t" + "ldr x20, [%[td4]]\n\t" + "ldr x20, [%[td4], #64]\n\t" + "ldr x20, [%[td4], #128]\n\t" + "ldr x20, [%[td4], #192]\n\t" + "ldr x20, [%[td4], #256]\n\t" + "ldr x20, [%[td4], #320]\n\t" + "ldr x20, [%[td4], #384]\n\t" + "ldr x20, [%[td4], #448]\n\t" + "ldr x20, [%[td4], #512]\n\t" + "ldr x20, [%[td4], #576]\n\t" + "ldr x20, [%[td4], #640]\n\t" + "ldr x20, [%[td4], #704]\n\t" + "ldr x20, [%[td4], #768]\n\t" + "ldr x20, [%[td4], #832]\n\t" + "ldr x20, [%[td4], 
#896]\n\t" + "ldr x20, [%[td4], #960]\n\t" + "ldrb w12, [%[td4], x12, LSL 0]\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "ubfx x13, x17, #0, #8\n\t" + "eor w12, w12, w15, lsl 8\n\t" + "ubfx x15, x17, #40, #8\n\t" + "eor w12, w12, w21, lsl 16\n\t" + "ubfx x21, x16, #16, #8\n\t" + "eor w12, w12, w22, lsl 24\n\t" + "ubfx x22, x16, #56, #8\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "ldrb w13, [%[td4], x13, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ubfx x14, x17, #32, #8\n\t" + "eor w13, w13, w15, lsl 8\n\t" + "ubfx x15, x16, #8, #8\n\t" + "eor w13, w13, w21, lsl 16\n\t" + "ubfx x21, x16, #48, #8\n\t" + "eor w13, w13, w22, lsl 24\n\t" + "ubfx x22, x17, #24, #8\n\t" + "bfi x12, x13, #32, #32\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "ldrb w14, [%[td4], x14, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ubfx x20, x17, #56, #8\n\t" + "eor w14, w14, w15, lsl 8\n\t" + "ubfx x15, x16, #0, #8\n\t" + "eor w14, w14, w21, lsl 16\n\t" + "ubfx x21, x16, #40, #8\n\t" + "eor w13, w14, w22, lsl 24\n\t" + "ubfx x22, x17, #16, #8\n\t" + "ldrb w20, [%[td4], x20, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "eor w21, w21, w20, lsl 16\n\t" + "ldp x16, x17, [x28]\n\t" + "eor w15, w15, w21, lsl 8\n\t" + "eor w15, w15, w22, lsl 16\n\t" + "bfi x13, x15, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x12, x12, x16\n\t" + "eor x13, x13, x17\n\t" + "rev32 x12, x12\n\t" + "rev32 x13, x13\n\t" + "eor x12, x12, x25\n\t" + "eor x13, x13, x26\n\t" + "stp x12, x13, [%x[tmp]]\n\t" + "add %x[out], %x[out], #16\n\t" + "mov w16, %w[sz]\n\t" + "\n" + "L_AES_XTS_decrypt_start_byte_%=: \n\t" + "ldrb w21, [%x[tmp]]\n\t" + "ldrb w22, [%x[in]], #1\n\t" + "strb w21, [%x[out]], #1\n\t" + "strb w22, [%x[tmp]], #1\n\t" + "subs w16, w16, #1\n\t" + 
"b.gt L_AES_XTS_decrypt_start_byte_%=\n\t" + "sub %x[out], %x[out], %x[sz]\n\t" + "sub %x[tmp], %x[tmp], %x[sz]\n\t" + "sub %x[out], %x[out], #16\n\t" + "mov x28, %x[key]\n\t" + "ldp x12, x13, [%x[tmp]]\n\t" + "ldp x16, x17, [x28], #16\n\t" + "eor x12, x12, x23\n\t" + "eor x13, x13, x24\n\t" + "rev32 x12, x12\n\t" + "rev32 x13, x13\n\t" + /* Round: 0 - XOR in key schedule */ + "eor x12, x12, x16\n\t" + "eor x13, x13, x17\n\t" + "sub w27, %w[nr], #2\n\t" + "\n" + "L_AES_XTS_decrypt_loop_nr_partial_2_%=: \n\t" + "ubfx x16, x13, #48, #8\n\t" + "ubfx x20, x12, #24, #8\n\t" + "ubfx x21, x13, #8, #8\n\t" + "ubfx x22, x12, #32, #8\n\t" + "ldr x14, [%[td]]\n\t" + "ldr x14, [%[td], #64]\n\t" + "ldr x14, [%[td], #128]\n\t" + "ldr x14, [%[td], #192]\n\t" + "ldr x14, [%[td], #256]\n\t" + "ldr x14, [%[td], #320]\n\t" + "ldr x14, [%[td], #384]\n\t" + "ldr x14, [%[td], #448]\n\t" + "ldr x14, [%[td], #512]\n\t" + "ldr x14, [%[td], #576]\n\t" + "ldr x14, [%[td], #640]\n\t" + "ldr x14, [%[td], #704]\n\t" + "ldr x14, [%[td], #768]\n\t" + "ldr x14, [%[td], #832]\n\t" + "ldr x14, [%[td], #896]\n\t" + "ldr x14, [%[td], #960]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x17, x12, #16, #8\n\t" + "eor w16, w16, w20, ror 24\n\t" + "ubfx x20, x12, #56, #8\n\t" + "eor w16, w16, w21, ror 8\n\t" + "ubfx x21, x13, #40, #8\n\t" + "eor w16, w16, w22, ror 16\n\t" + "ubfx x22, x13, #0, #8\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x19, x12, #48, #8\n\t" + "eor w17, w17, w20, ror 24\n\t" + "ubfx x20, x13, #24, #8\n\t" + "eor w17, w17, w21, ror 8\n\t" + "ubfx x21, x12, #8, #8\n\t" + "eor w17, w17, w22, ror 16\n\t" + "ubfx x22, x13, #32, #8\n\t" + "bfi x16, x17, #32, #32\n\t" + "ldr w19, [%[td], x19, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, 
LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x14, x12, #0, #8\n\t" + "eor w19, w19, w20, ror 24\n\t" + "ubfx x20, x13, #16, #8\n\t" + "eor w19, w19, w21, ror 8\n\t" + "ubfx x21, x13, #56, #8\n\t" + "eor w17, w19, w22, ror 16\n\t" + "ubfx x22, x12, #40, #8\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "eor w21, w21, w14, ror 24\n\t" + "ldp x12, x13, [x28], #16\n\t" + "eor w20, w20, w22, ror 8\n\t" + "eor w20, w20, w21, ror 24\n\t" + "bfi x17, x20, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x16, x16, x12\n\t" + "eor x17, x17, x13\n\t" + "ubfx x12, x17, #48, #8\n\t" + "ubfx x15, x16, #24, #8\n\t" + "ubfx x21, x17, #8, #8\n\t" + "ubfx x22, x16, #32, #8\n\t" + "ldr x19, [%[td]]\n\t" + "ldr x19, [%[td], #64]\n\t" + "ldr x19, [%[td], #128]\n\t" + "ldr x19, [%[td], #192]\n\t" + "ldr x19, [%[td], #256]\n\t" + "ldr x19, [%[td], #320]\n\t" + "ldr x19, [%[td], #384]\n\t" + "ldr x19, [%[td], #448]\n\t" + "ldr x19, [%[td], #512]\n\t" + "ldr x19, [%[td], #576]\n\t" + "ldr x19, [%[td], #640]\n\t" + "ldr x19, [%[td], #704]\n\t" + "ldr x19, [%[td], #768]\n\t" + "ldr x19, [%[td], #832]\n\t" + "ldr x19, [%[td], #896]\n\t" + "ldr x19, [%[td], #960]\n\t" + "ldr w12, [%[td], x12, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x13, x16, #16, #8\n\t" + "eor w12, w12, w15, ror 24\n\t" + "ubfx x15, x16, #56, #8\n\t" + "eor w12, w12, w21, ror 8\n\t" + "ubfx x21, x17, #40, #8\n\t" + "eor w12, w12, w22, ror 16\n\t" + "ubfx x22, x17, #0, #8\n\t" + "ldr w13, [%[td], x13, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x14, x16, #48, #8\n\t" + "eor w13, w13, w15, ror 24\n\t" + "ubfx x15, x17, #24, #8\n\t" + "eor w13, w13, w21, ror 8\n\t" + "ubfx x21, x16, #8, #8\n\t" + "eor w13, w13, w22, ror 16\n\t" + "ubfx 
x22, x17, #32, #8\n\t" + "bfi x12, x13, #32, #32\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x19, x16, #0, #8\n\t" + "eor w14, w14, w15, ror 24\n\t" + "ubfx x15, x17, #16, #8\n\t" + "eor w14, w14, w21, ror 8\n\t" + "ubfx x21, x17, #56, #8\n\t" + "eor w13, w14, w22, ror 16\n\t" + "ubfx x22, x16, #40, #8\n\t" + "ldr w19, [%[td], x19, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w15, [%[td], x15, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "eor w21, w21, w19, ror 24\n\t" + "ldp x16, x17, [x28], #16\n\t" + "eor w15, w15, w22, ror 8\n\t" + "eor w15, w15, w21, ror 24\n\t" + "bfi x13, x15, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x12, x12, x16\n\t" + "eor x13, x13, x17\n\t" + "subs w27, w27, #2\n\t" + "b.ne L_AES_XTS_decrypt_loop_nr_partial_2_%=\n\t" + "ubfx x16, x13, #48, #8\n\t" + "ubfx x20, x12, #24, #8\n\t" + "ubfx x21, x13, #8, #8\n\t" + "ubfx x22, x12, #32, #8\n\t" + "ldr x14, [%[td]]\n\t" + "ldr x14, [%[td], #64]\n\t" + "ldr x14, [%[td], #128]\n\t" + "ldr x14, [%[td], #192]\n\t" + "ldr x14, [%[td], #256]\n\t" + "ldr x14, [%[td], #320]\n\t" + "ldr x14, [%[td], #384]\n\t" + "ldr x14, [%[td], #448]\n\t" + "ldr x14, [%[td], #512]\n\t" + "ldr x14, [%[td], #576]\n\t" + "ldr x14, [%[td], #640]\n\t" + "ldr x14, [%[td], #704]\n\t" + "ldr x14, [%[td], #768]\n\t" + "ldr x14, [%[td], #832]\n\t" + "ldr x14, [%[td], #896]\n\t" + "ldr x14, [%[td], #960]\n\t" + "ldr w16, [%[td], x16, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x17, x12, #16, #8\n\t" + "eor w16, w16, w20, ror 24\n\t" + "ubfx x20, x12, #56, #8\n\t" + "eor w16, w16, w21, ror 8\n\t" + "ubfx x21, x13, #40, #8\n\t" + "eor w16, w16, w22, ror 16\n\t" + "ubfx x22, x13, #0, #8\n\t" + "ldr w17, [%[td], x17, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr 
w22, [%[td], x22, LSL 2]\n\t" + "ubfx x19, x12, #48, #8\n\t" + "eor w17, w17, w20, ror 24\n\t" + "ubfx x20, x13, #24, #8\n\t" + "eor w17, w17, w21, ror 8\n\t" + "ubfx x21, x12, #8, #8\n\t" + "eor w17, w17, w22, ror 16\n\t" + "ubfx x22, x13, #32, #8\n\t" + "bfi x16, x17, #32, #32\n\t" + "ldr w19, [%[td], x19, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "ubfx x14, x12, #0, #8\n\t" + "eor w19, w19, w20, ror 24\n\t" + "ubfx x20, x13, #16, #8\n\t" + "eor w19, w19, w21, ror 8\n\t" + "ubfx x21, x13, #56, #8\n\t" + "eor w17, w19, w22, ror 16\n\t" + "ubfx x22, x12, #40, #8\n\t" + "ldr w14, [%[td], x14, LSL 2]\n\t" + "ldr w21, [%[td], x21, LSL 2]\n\t" + "ldr w20, [%[td], x20, LSL 2]\n\t" + "ldr w22, [%[td], x22, LSL 2]\n\t" + "eor w21, w21, w14, ror 24\n\t" + "ldp x12, x13, [x28], #16\n\t" + "eor w20, w20, w22, ror 8\n\t" + "eor w20, w20, w21, ror 24\n\t" + "bfi x17, x20, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x16, x16, x12\n\t" + "eor x17, x17, x13\n\t" + "ubfx x12, x16, #32, #8\n\t" + "ubfx x15, x17, #8, #8\n\t" + "ubfx x21, x17, #48, #8\n\t" + "ubfx x22, x16, #24, #8\n\t" + "ldr x20, [%[td4]]\n\t" + "ldr x20, [%[td4], #64]\n\t" + "ldr x20, [%[td4], #128]\n\t" + "ldr x20, [%[td4], #192]\n\t" + "ldr x20, [%[td4], #256]\n\t" + "ldr x20, [%[td4], #320]\n\t" + "ldr x20, [%[td4], #384]\n\t" + "ldr x20, [%[td4], #448]\n\t" + "ldr x20, [%[td4], #512]\n\t" + "ldr x20, [%[td4], #576]\n\t" + "ldr x20, [%[td4], #640]\n\t" + "ldr x20, [%[td4], #704]\n\t" + "ldr x20, [%[td4], #768]\n\t" + "ldr x20, [%[td4], #832]\n\t" + "ldr x20, [%[td4], #896]\n\t" + "ldr x20, [%[td4], #960]\n\t" + "ldrb w12, [%[td4], x12, LSL 0]\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "ubfx x13, x17, #0, #8\n\t" + "eor w12, w12, w15, lsl 8\n\t" + "ubfx x15, x17, #40, #8\n\t" + "eor w12, w12, w21, lsl 16\n\t" + "ubfx x21, x16, #16, #8\n\t" + "eor 
w12, w12, w22, lsl 24\n\t" + "ubfx x22, x16, #56, #8\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "ldrb w13, [%[td4], x13, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ubfx x14, x17, #32, #8\n\t" + "eor w13, w13, w15, lsl 8\n\t" + "ubfx x15, x16, #8, #8\n\t" + "eor w13, w13, w21, lsl 16\n\t" + "ubfx x21, x16, #48, #8\n\t" + "eor w13, w13, w22, lsl 24\n\t" + "ubfx x22, x17, #24, #8\n\t" + "bfi x12, x13, #32, #32\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "ldrb w14, [%[td4], x14, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ubfx x20, x17, #56, #8\n\t" + "eor w14, w14, w15, lsl 8\n\t" + "ubfx x15, x16, #0, #8\n\t" + "eor w14, w14, w21, lsl 16\n\t" + "ubfx x21, x16, #40, #8\n\t" + "eor w13, w14, w22, lsl 24\n\t" + "ubfx x22, x17, #16, #8\n\t" + "ldrb w20, [%[td4], x20, LSL 0]\n\t" + "ldrb w21, [%[td4], x21, LSL 0]\n\t" + "ldrb w15, [%[td4], x15, LSL 0]\n\t" + "ldrb w22, [%[td4], x22, LSL 0]\n\t" + "eor w21, w21, w20, lsl 16\n\t" + "ldp x16, x17, [x28]\n\t" + "eor w15, w15, w21, lsl 8\n\t" + "eor w15, w15, w22, lsl 16\n\t" + "bfi x13, x15, #32, #32\n\t" + /* XOR in Key Schedule */ + "eor x12, x12, x16\n\t" + "eor x13, x13, x17\n\t" + "rev32 x12, x12\n\t" + "rev32 x13, x13\n\t" + "eor x12, x12, x23\n\t" + "eor x13, x13, x24\n\t" + "stp x12, x13, [%x[out]]\n\t" + "\n" + "L_AES_XTS_decrypt_done_data_%=: \n\t" + "ldp x29, x30, [sp], #32\n\t" + : [out] "+r" (out), [sz] "+r" (sz), [key] "+r" (key), + [key2] "+r" (key2), [tmp] "+r" (tmp), [nr] "+r" (nr) + : [in] "r" (in), [i] "r" (i), [td] "r" (td), [td4] "r" (td4), + [te] "r" (te) + : "memory", "cc", "x11", "x12", "x13", "x14", "x15", "x16", "x17", + "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", + "x28" + ); +} + +#endif /* HAVE_AES_DECRYPT */ +#endif /* WOLFSSL_AES_XTS */ +#endif /* !WOLFSSL_ARMASM_NEON_NO_TABLE_LOOKUP */ #endif /* !defined(NO_AES) && defined(WOLFSSL_ARMASM) */ #endif /* __aarch64__ */ #endif 
/* WOLFSSL_ARMASM */ diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h index f4da2a0b2..36c8e6ef4 100644 --- a/wolfssl/wolfcrypt/aes.h +++ b/wolfssl/wolfcrypt/aes.h @@ -888,9 +888,74 @@ WOLFSSL_API int wc_AesCtsDecryptFinal(Aes* aes, byte* out, word32* outSz); #endif -#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ - !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#if defined(WOLFSSL_ARMASM) +#if defined(__aarch64__) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +WOLFSSL_LOCAL void AES_set_encrypt_key(const unsigned char* key, word32 len, + unsigned char* ks); +WOLFSSL_LOCAL void AES_invert_key(unsigned char* ks, word32 rounds); +WOLFSSL_LOCAL void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr); +WOLFSSL_LOCAL void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr); +WOLFSSL_LOCAL void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); +WOLFSSL_LOCAL void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); +WOLFSSL_LOCAL void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +#if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT) +/* in pre-C2x C, constness conflicts for dimensioned arrays can't be resolved. 
+ */ +WOLFSSL_LOCAL void GCM_gmult_len(byte* x, const byte** m, + const unsigned char* data, unsigned long len); +#endif /* GCM_TABLE || GCM_TABLE_4BIT */ +WOLFSSL_LOCAL void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, + unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +#if defined(WOLFSSL_AES_XTS) && defined(__aarch64__) +WOLFSSL_LOCAL void AES_XTS_encrypt(const byte* in, byte* out, word32 sz, + const byte* i, byte* key, byte* key2, byte* tmp, int nr); +WOLFSSL_LOCAL void AES_XTS_decrypt(const byte* in, byte* out, word32 sz, + const byte* i, byte* key, byte* key2, byte* tmp, int nr); +#endif /* WOLFSSL_AES_XTS && __aarch64__ */ +#endif /* __aarch64__ || WOLFSSL_ARMASM_NO_HW_CRYPTO */ + +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_NEON) +WOLFSSL_LOCAL void AES_set_encrypt_key_NEON(const unsigned char* key, + word32 len, unsigned char* ks); +WOLFSSL_LOCAL void AES_invert_key_NEON(unsigned char* ks, word32 rounds); +WOLFSSL_LOCAL void AES_ECB_encrypt_NEON(const unsigned char* in, + unsigned char* out, unsigned long len, const unsigned char* ks, int nr); +WOLFSSL_LOCAL void AES_ECB_decrypt_NEON(const unsigned char* in, + unsigned char* out, unsigned long len, const unsigned char* ks, int nr); +WOLFSSL_LOCAL void AES_CBC_encrypt_NEON(const unsigned char* in, + unsigned char* out, unsigned long len, const unsigned char* ks, int nr, + unsigned char* iv); +WOLFSSL_LOCAL void AES_CBC_decrypt_NEON(const unsigned char* in, + unsigned char* out, unsigned long len, const unsigned char* ks, int nr, + unsigned char* iv); +WOLFSSL_LOCAL void AES_CTR_encrypt_NEON(const unsigned char* in, + unsigned char* out, unsigned long len, const unsigned char* ks, int nr, + unsigned char* ctr); +#if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT) +/* in pre-C2x C, constness conflicts for dimensioned arrays can't be resolved. 
+ */ +WOLFSSL_LOCAL void GCM_gmult_len_NEON(byte* x, const byte* h, + const unsigned char* data, unsigned long len); +#endif +WOLFSSL_LOCAL void AES_GCM_encrypt_NEON(const unsigned char* in, + unsigned char* out, unsigned long len, const unsigned char* ks, int nr, + unsigned char* ctr); +#endif + +#ifdef WOLFSSL_AES_XTS +WOLFSSL_LOCAL void AES_XTS_encrypt_NEON(const byte* in, byte* out, word32 sz, + const byte* i, byte* key, byte* key2, byte* tmp, int nr); +WOLFSSL_LOCAL void AES_XTS_decrypt_NEON(const byte* in, byte* out, word32 sz, + const byte* i, byte* key, byte* key2, byte* tmp, int nr); +#endif /* WOLFSSL_AES_XTS */ + +#if defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) WOLFSSL_LOCAL void AES_set_key_AARCH64(const byte* userKey, int keylen, byte* key, int dir); @@ -979,7 +1044,7 @@ WOLFSSL_LOCAL void AES_GCM_decrypt_final_AARCH64_EOR3(byte* tag, const byte* authTag, word32 tbytes, word32 nbytes, word32 abytes, byte* h, byte* initCtr, int* res); #endif -#endif +#endif /* WOLFSSL_AESGCM_STREAM */ #ifdef WOLFSSL_AES_XTS WOLFSSL_LOCAL void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, @@ -987,31 +1052,9 @@ WOLFSSL_LOCAL void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, WOLFSSL_LOCAL void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, word32 sz, const byte* i, byte* key, byte* key2, byte* tmp, int nr); #endif /* WOLFSSL_AES_XTS */ -#endif /* __aarch64__ && WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#endif /* __aarch64__ && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ -#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) -#if !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) -WOLFSSL_LOCAL void AES_set_key_AARCH32(const byte* userKey, int keylen, - byte* key, int dir); - -WOLFSSL_LOCAL void AES_encrypt_AARCH32(const byte* inBlock, byte* outBlock, - byte* key, int nr); -WOLFSSL_LOCAL void AES_decrypt_AARCH32(const byte* inBlock, byte* outBlock, - byte* key, int nr); -WOLFSSL_LOCAL void AES_encrypt_blocks_AARCH32(const byte* in, byte* out, - word32 sz, 
byte* key, int nr); -#endif - -#ifdef WOLFSSL_AES_XTS -WOLFSSL_LOCAL void AES_XTS_encrypt_AARCH64(const byte* in, byte* out, - word32 sz, const byte* i, byte* key, byte* key2, byte* tmp, int nr); -WOLFSSL_LOCAL void AES_XTS_decrypt_AARCH64(const byte* in, byte* out, - word32 sz, const byte* i, byte* key, byte* key2, byte* tmp, int nr); -#endif /* WOLFSSL_AES_XTS */ -#endif /* __aarch64__ && WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ - -#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) -#if !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +#if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) WOLFSSL_LOCAL void AES_set_key_AARCH32(const byte* userKey, int keylen, byte* key, int dir); @@ -1049,30 +1092,8 @@ WOLFSSL_LOCAL void AES_XTS_encrypt_AARCH32(const byte* in, byte* out, WOLFSSL_LOCAL void AES_XTS_decrypt_AARCH32(const byte* in, byte* out, word32 sz, const byte* i, byte* key, byte* key2, byte* tmp, int nr); #endif /* WOLFSSL_AES_XTS */ -#else -WOLFSSL_LOCAL void AES_set_encrypt_key(const unsigned char* key, word32 len, - unsigned char* ks); -WOLFSSL_LOCAL void AES_invert_key(unsigned char* ks, word32 rounds); -WOLFSSL_LOCAL void AES_ECB_encrypt(const unsigned char* in, unsigned char* out, - unsigned long len, const unsigned char* ks, int nr); -WOLFSSL_LOCAL void AES_ECB_decrypt(const unsigned char* in, unsigned char* out, - unsigned long len, const unsigned char* ks, int nr); -WOLFSSL_LOCAL void AES_CBC_encrypt(const unsigned char* in, unsigned char* out, - unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); -WOLFSSL_LOCAL void AES_CBC_decrypt(const unsigned char* in, unsigned char* out, - unsigned long len, const unsigned char* ks, int nr, unsigned char* iv); -WOLFSSL_LOCAL void AES_CTR_encrypt(const unsigned char* in, unsigned char* out, - unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); -#if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT) -/* in pre-C2x C, constness conflicts for dimensioned arrays can't 
be resolved. - */ -WOLFSSL_LOCAL void GCM_gmult_len(byte* x, const byte** m, - const unsigned char* data, unsigned long len); -#endif -WOLFSSL_LOCAL void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, - unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); -#endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ -#endif +#endif /* !__aarch64__ && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ +#endif /* WOLFSSL_ARMASM */ #ifdef __cplusplus } /* extern "C" */