diff --git a/tests/api/test_chacha.c b/tests/api/test_chacha.c
index 5a843a7e9..6da3266e3 100644
--- a/tests/api/test_chacha.c
+++ b/tests/api/test_chacha.c
@@ -186,3 +186,184 @@ int test_wc_Chacha_Process(void)
     return EXPECT_RESULT();
 } /* END test_wc_Chacha_Process */
+
+#define CHACHA_LEN 1024
+/*
+ * Testing wc_Chacha_Process() when the data is processed in chunks
+ */
+int test_wc_Chacha_Process_Chunking(void)
+{
+    EXPECT_DECLS;
+#ifdef HAVE_CHACHA
+    ChaCha enc;
+    WC_DECLARE_VAR(plain, byte, CHACHA_LEN, NULL);
+    WC_DECLARE_VAR(cipher, byte, CHACHA_LEN, NULL);
+    byte key[CHACHA_MAX_KEY_SZ];
+    byte iv[CHACHA_IV_BYTES];
+    int i;
+    int cnt;
+    int sz;
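+    /* Expected one-shot ciphertext of CHACHA_LEN 0xa5 bytes under the
+     * key/IV set up below; every chunked run must reproduce it exactly. */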
+    const byte expected[CHACHA_LEN] = {
+        0xbc, 0xf5, 0x3b, 0xf2, 0x75, 0x85, 0x9e, 0x0a,
+        0x09, 0x58, 0x83, 0x50, 0x33, 0x12, 0x01, 0xa1,
+        0xb4, 0xaf, 0x8a, 0xe8, 0x4d, 0x3d, 0xa5, 0x68,
+        0xf7, 0x6d, 0x3e, 0xe0, 0x62, 0x7e, 0x62, 0x66,
+        0xdd, 0x07, 0xe9, 0x36, 0x6f, 0x4d, 0xe9, 0x7a,
+        0x16, 0x48, 0xa1, 0x83, 0x9e, 0x67, 0x4d, 0xa3,
+        0xfe, 0x7e, 0x4a, 0x31, 0xdd, 0xb6, 0x50, 0x39,
+        0xd2, 0x2b, 0x93, 0xf2, 0x4d, 0x51, 0x44, 0x42,
+        0x5d, 0xf1, 0xd9, 0x24, 0xd7, 0xef, 0x4b, 0xa4,
+        0xfd, 0x6a, 0x53, 0xa5, 0x1e, 0x4a, 0xc8, 0x68,
+        0x11, 0x69, 0xc6, 0xbd, 0xe1, 0x59, 0xe4, 0xca,
+        0x5b, 0xa9, 0x77, 0xfe, 0x4f, 0x82, 0x9f, 0xcf,
+        0x55, 0x16, 0x3c, 0xd5, 0x83, 0xee, 0xc7, 0x53,
+        0xaf, 0xca, 0x8a, 0xe2, 0xcf, 0xf1, 0x4b, 0x3b,
+        0x44, 0xf6, 0xc9, 0x6c, 0x5b, 0xd3, 0x28, 0x8a,
+        0x7e, 0x67, 0xaa, 0x9e, 0xad, 0xce, 0x96, 0xc4,
+        0x6e, 0x95, 0x8c, 0xf8, 0xf6, 0xb6, 0x42, 0x8e,
+        0xe7, 0xab, 0xc8, 0x2c, 0x66, 0x8b, 0x80, 0xcf,
+        0x78, 0xfe, 0x35, 0x8b, 0x59, 0x18, 0x45, 0xcb,
+        0x18, 0xd4, 0x09, 0x88, 0xa9, 0xf9, 0x27, 0xd1,
+        0x3b, 0x9d, 0x2b, 0xff, 0x89, 0x21, 0xb0, 0xd2,
+        0xa7, 0x7e, 0x35, 0x61, 0xae, 0x1c, 0xc3, 0x1c,
+        0x07, 0x5c, 0x10, 0x5d, 0x71, 0x3a, 0x3a, 0xe8,
+        0x4c, 0xba, 0x00, 0xde, 0xd1, 0xf9, 0xa1, 0xae,
+        0x7b, 0x91, 0x9d, 0x66, 0x31, 0x18, 0x55, 0x39,
+        0xec, 0x1d, 0x83, 0x85, 0x1e, 0x5b, 0x35, 0x17,
+        0x2e, 0xbc, 0x7a, 0x22, 0x79, 0x09, 0xa7, 0x02,
+        0xf7, 0x3b, 0x93, 0x2c, 0x89, 0x1b, 0x69, 0xde,
+        0x80, 0xc8, 0xdf, 0xce, 0xf9, 0xcd, 0xc8, 0x58,
+        0xd6, 0x4b, 0x65, 0x9a, 0xc4, 0x4f, 0x27, 0xdb,
+        0x9a, 0x6c, 0x3a, 0xef, 0x20, 0x0b, 0x00, 0x5c,
+        0x9f, 0x91, 0xc1, 0xf6, 0x80, 0x53, 0x6c, 0x42,
+        0xe3, 0xd0, 0xfb, 0x3b, 0x23, 0x75, 0x45, 0xa7,
+        0x5b, 0x9b, 0xaa, 0xcd, 0x1e, 0x03, 0x35, 0x68,
+        0x17, 0xee, 0xff, 0xd7, 0x4f, 0x77, 0x2f, 0xd0,
+        0x1d, 0x5e, 0x89, 0x16, 0x50, 0x6f, 0x22, 0x44,
+        0x10, 0x64, 0x37, 0x66, 0x70, 0x7f, 0x4d, 0x58,
+        0x36, 0xec, 0x56, 0x4e, 0xfd, 0x22, 0x8d, 0x77,
+        0xb1, 0x37, 0x07, 0x13, 0xdf, 0x34, 0x40, 0x1c,
+        0x65, 0x95, 0x9b, 0xb9, 0xac, 0x11, 0xfe, 0x7a,
+        0xae, 0x1f, 0x17, 0x94, 0xd4, 0xdd, 0x5b, 0x4f,
+        0x69, 0xa8, 0x04, 0x8e, 0x80, 0x87, 0x7d, 0x96,
+        0x25, 0x37, 0x83, 0x0e, 0xca, 0xa4, 0xb3, 0x29,
+        0x2f, 0x4b, 0x83, 0xa4, 0x01, 0x36, 0x0d, 0xdb,
+        0xd7, 0x6e, 0x7a, 0x9c, 0x3e, 0x82, 0xc8, 0x5f,
+        0x4e, 0xc6, 0xd2, 0x97, 0x64, 0xe6, 0xd9, 0x50,
+        0x89, 0xcb, 0x64, 0x33, 0x28, 0x9c, 0x14, 0xf9,
+        0x41, 0x33, 0x99, 0x0c, 0x87, 0x6f, 0x00, 0x3f,
+        0x00, 0x6f, 0xae, 0xe9, 0x20, 0xc2, 0xcd, 0xb8,
+        0x7a, 0x58, 0xde, 0x57, 0x34, 0xda, 0x63, 0xa1,
+        0x0b, 0x55, 0xfc, 0x54, 0x2a, 0xed, 0xc0, 0xbc,
+        0x29, 0x5f, 0x88, 0x7d, 0x37, 0x3b, 0x48, 0x86,
+        0x3f, 0x88, 0xa2, 0xef, 0x55, 0xe6, 0xc4, 0xf8,
+        0xb8, 0x11, 0x9e, 0x3a, 0x45, 0x79, 0xac, 0x85,
+        0xb2, 0x70, 0x40, 0xd0, 0x66, 0xe7, 0x66, 0xc8,
+        0x8e, 0x8f, 0xde, 0xde, 0xf8, 0x50, 0x79, 0x9e,
+        0x37, 0x04, 0x07, 0x83, 0x5b, 0xe0, 0x68, 0x5b,
+        0x32, 0xbc, 0x6e, 0x50, 0x05, 0xca, 0xf8, 0x3b,
+        0xec, 0x15, 0x13, 0xf8, 0x9a, 0xa2, 0x58, 0x98,
+        0x03, 0x29, 0x83, 0x7f, 0x11, 0xb4, 0x98, 0x41,
+        0xc1, 0xd9, 0x02, 0x6e, 0x2c, 0x45, 0x55, 0xab,
+        0xff, 0xcf, 0x23, 0x80, 0xf0, 0x82, 0x73, 0xe9,
+        0xe6, 0x8f, 0x1a, 0xd9, 0x70, 0xd6, 0x46, 0x1f,
+        0xa8, 0xf8, 0xbd, 0x14, 0xd9, 0x50, 0x59, 0x8e,
+        0x46, 0xbf, 0xe2, 0x8a, 0x8e, 0xce, 0xe7, 0x81,
+        0xf4, 0x3a, 0xd9, 0x07, 0xd8, 0x1d, 0x29, 0x19,
+        0xc1, 0x9d, 0xac, 0x6f, 0xfb, 0xce, 0x95, 0x03,
+        0x29, 0xce, 0x4a, 0x60, 0x34, 0x6a, 0x88, 0xc7,
+        0x5e, 0x8c, 0x71, 0x29, 0x81, 0x64, 0x2f, 0xfb,
+        0xb4, 0x20, 0x08, 0x57, 0xba, 0x50, 0x75, 0x7b,
+        0x1e, 0xfa, 0xcc, 0x60, 0xe7, 0x09, 0xab, 0x4e,
+        0x46, 0x64, 0xfe, 0x17, 0x00, 0x84, 0x8b, 0xca,
+        0xa8, 0xcb, 0x18, 0x5b, 0xa2, 0x04, 0x13, 0x68,
+        0x99, 0x02, 0xaf, 0xcb, 0x75, 0xcb, 0x46, 0x61,
+        0x66, 0x05, 0xd9, 0x5c, 0x6d, 0x8c, 0xf9, 0x8a,
+        0x57, 0xde, 0xf4, 0xb9, 0x5d, 0x51, 0x17, 0x4a,
+        0x8c, 0x42, 0xca, 0x0d, 0x7f, 0x92, 0x69, 0x0d,
+        0x88, 0x2b, 0xc6, 0xee, 0xbd, 0x5a, 0x32, 0x17,
+        0x84, 0xef, 0xf9, 0xd9, 0x51, 0x33, 0x57, 0x2f,
+        0x87, 0xf8, 0xda, 0x3c, 0x3c, 0x14, 0xa9, 0x26,
+        0xad, 0x19, 0xfd, 0x14, 0x5e, 0x33, 0x92, 0xb1,
+        0xe1, 0xd7, 0xfb, 0x1e, 0x55, 0x40, 0xe5, 0x80,
+        0x9b, 0x8e, 0x4b, 0x88, 0x58, 0x77, 0xa9, 0xd2,
+        0xbf, 0x40, 0x90, 0xbe, 0x8f, 0x1f, 0xa7, 0x8a,
+        0xaf, 0x8e, 0x03, 0x93, 0x4d, 0x8a, 0x73, 0x8e,
+        0x76, 0x67, 0x43, 0x37, 0xc1, 0x76, 0x87, 0x50,
+        0x37, 0xc4, 0x02, 0x4a, 0x53, 0x1a, 0x5b, 0xe8,
+        0x5f, 0xc8, 0x28, 0xad, 0xd3, 0x8a, 0x97, 0x53,
+        0xa3, 0xf6, 0x48, 0xba, 0x05, 0x18, 0x56, 0x90,
+        0xa9, 0x95, 0xd8, 0xac, 0xe9, 0xd5, 0x6c, 0xe3,
+        0x1f, 0xd8, 0xfc, 0xc5, 0x27, 0x19, 0xab, 0x4a,
+        0xc4, 0x36, 0xc9, 0xe9, 0xaa, 0x30, 0xef, 0x8e,
+        0x9e, 0x01, 0x18, 0x68, 0xe9, 0x06, 0xf8, 0x54,
+        0xe5, 0xe2, 0xec, 0xde, 0x52, 0xfc, 0x3b, 0xdd,
+        0xe9, 0xc7, 0xc8, 0x2b, 0x93, 0xd4, 0xdb, 0x28,
+        0x72, 0x06, 0x07, 0xd1, 0xba, 0x05, 0x23, 0xa6,
+        0x41, 0x42, 0x55, 0x6a, 0x6e, 0x6f, 0x6c, 0x40,
+        0x6a, 0x19, 0xa4, 0xd5, 0xa2, 0x11, 0xb5, 0x2b,
+        0x16, 0x4a, 0xe3, 0x41, 0xf3, 0xaf, 0x93, 0xbd,
+        0xc8, 0xd9, 0x26, 0x43, 0x71, 0x56, 0xd2, 0x5e,
+        0xf5, 0xa8, 0x3c, 0x64, 0x83, 0x04, 0x89, 0x62,
+        0x20, 0xd3, 0xe9, 0x8e, 0x60, 0xcd, 0xec, 0xd9,
+        0xce, 0x89, 0xf0, 0x5c, 0xf2, 0x26, 0x72, 0x51,
+        0xd5, 0x16, 0x7b, 0xef, 0x19, 0x10, 0xb4, 0xce,
+        0x60, 0x47, 0xab, 0x98, 0x86, 0xbd, 0x39, 0xb7,
+        0xc9, 0x29, 0x38, 0x1a, 0xc1, 0x5c, 0xab, 0x77,
+        0xea, 0xe9, 0xf4, 0x7f, 0x6a, 0x06, 0xf7, 0xc0,
+        0x0b, 0x17, 0x1f, 0x2f, 0xce, 0x07, 0x1b, 0x33,
+        0x68, 0x4d, 0x64, 0x6a, 0x28, 0x6d, 0x1d, 0xc6,
+        0x54, 0x5c, 0xa2, 0x69, 0xf9, 0xb4, 0x62, 0xc9,
+        0x71, 0xf5, 0xd1, 0xb7, 0x7b, 0x02, 0x81, 0x6d,
+        0x4b, 0x1f, 0x62, 0xc5, 0xce, 0x2e, 0xc6, 0x2a,
+        0x1d, 0x6f, 0xc7, 0xc1, 0x99, 0x48, 0x7b, 0xc7,
+        0xf3, 0x53, 0xb7, 0x02, 0x7f, 0x82, 0xda, 0xfa,
+        0xce, 0xd3, 0x54, 0xf8, 0x9b, 0x30, 0x6f, 0xed,
+        0x6c, 0xec, 0x1c, 0x21, 0x49, 0x04, 0x51, 0xae,
+        0xd0, 0x3f, 0xb1, 0xfb, 0x78, 0x1a, 0x6f, 0x35,
+        0xc8, 0x3f, 0x4c, 0x43, 0x71, 0xe9, 0xb8, 0xd7,
+        0x74, 0xca, 0x46, 0x68, 0xeb, 0xd9, 0xa3, 0x94,
+        0x6e, 0x9d, 0xea, 0x57, 0x22, 0x1e, 0x15, 0x27,
+        0x40, 0xd4, 0x0c, 0x32, 0x40, 0xc0, 0x40, 0x8a,
+        0x1e, 0x2e, 0x1a, 0x58, 0x84, 0xa0, 0xc3, 0x68,
+        0x96, 0xfe, 0xb0, 0x96, 0x6c, 0x04, 0x61, 0x35,
+        0x4a, 0x78, 0xc5, 0xeb, 0x50, 0xca, 0xcb, 0x22,
+        0x7b, 0x53, 0x02, 0xfa, 0x63, 0x28, 0x10, 0x68,
+        0x77, 0xab, 0xda, 0x7d, 0xd1, 0xc2, 0x3f, 0x95,
+        0xa6, 0x5a, 0x92, 0x56, 0xb3, 0xb0, 0x29, 0x7e,
+        0x0c, 0xb3, 0xc9, 0x39, 0x0f, 0x1f, 0x51, 0x9d
+    };
+
+    WC_ALLOC_VAR(plain, byte, CHACHA_LEN, NULL);
+    WC_ALLOC_VAR(cipher, byte, CHACHA_LEN, NULL);
+
+    XMEMSET(plain, 0xa5, CHACHA_LEN);
+    for (i = 0; i < (int)sizeof(key); i++) {
+        key[i] = (byte)i;
+    }
+    for (i = 0; i < (int)sizeof(iv); i++) {
+        iv[i] = (byte)(i + 0x40);
+    }
+
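+    /* For every chunk size, the keystream must continue exactly where the
+     * previous wc_Chacha_Process() call left off. */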
+    for (sz = 1; sz < CHACHA_LEN; sz++) {
+        ExpectIntEQ(wc_Chacha_SetKey(&enc, key, (word32)sizeof(key)), 0);
+        ExpectIntEQ(wc_Chacha_SetIV(&enc, iv, 0), 0);
+
+        for (cnt = 0; cnt + sz <= CHACHA_LEN; cnt += sz) {
+            ExpectIntEQ(wc_Chacha_Process(&enc, cipher + cnt, plain + cnt, sz),
+                0);
+        }
+        if (cnt < CHACHA_LEN) {
+            ExpectIntEQ(wc_Chacha_Process(&enc, cipher + cnt, plain + cnt,
+                CHACHA_LEN - cnt), 0);
+        }
+        ExpectBufEQ(cipher, expected, (int)sizeof(expected));
+    }
+
+    WC_FREE_VAR(plain, NULL);
+    WC_FREE_VAR(cipher, NULL);
+#endif
+    return EXPECT_RESULT();
+} /* END test_wc_Chacha_Process_Chunking */
+
+
diff --git a/tests/api/test_chacha.h b/tests/api/test_chacha.h
index d9146775e..8403d25c0 100644
--- a/tests/api/test_chacha.h
+++ b/tests/api/test_chacha.h
@@ -26,9 +26,11 @@
 
 int test_wc_Chacha_SetKey(void);
 int test_wc_Chacha_Process(void);
+int test_wc_Chacha_Process_Chunking(void);
 
-#define TEST_CHACHA_DECLS \
-    TEST_DECL_GROUP("chacha", test_wc_Chacha_SetKey), \
-    TEST_DECL_GROUP("chacha", test_wc_Chacha_Process)
+#define TEST_CHACHA_DECLS \
+    TEST_DECL_GROUP("chacha", test_wc_Chacha_SetKey), \
+    TEST_DECL_GROUP("chacha", test_wc_Chacha_Process), \
+    TEST_DECL_GROUP("chacha", test_wc_Chacha_Process_Chunking)
 
 #endif /* WOLFCRYPT_TEST_CHACHA_H */
diff --git a/wolfcrypt/src/port/arm/armv8-chacha-asm.S b/wolfcrypt/src/port/arm/armv8-chacha-asm.S
index 94acf2871..b4f102a2a 100644
--- a/wolfcrypt/src/port/arm/armv8-chacha-asm.S
+++ b/wolfcrypt/src/port/arm/armv8-chacha-asm.S
@@ -493,321 +493,233 @@ L_chacha_crypt_bytes_arm64_round_start_320:
 L_chacha_crypt_bytes_arm64_lt_320:
     cmp x3, #0x100
     blt L_chacha_crypt_bytes_arm64_lt_256
-    # Move state into regular register
-    mov x8, v16.d[0]
-    mov x10, v16.d[1]
-    mov x12, v17.d[0]
-    mov x14, v17.d[1]
-    mov x16, v18.d[0]
-    mov x19, v18.d[1]
-    mov x21, v19.d[0]
-    mov x23, v19.d[1]
     # Move state into vector registers
-    mov v0.16b, v16.16b
-    mov v1.16b, v17.16b
-    lsr x9, x8, #32
-    mov v2.16b, v18.16b
-    add w5, w21, #1
-    mov v3.16b, v19.16b
-    lsr x11, x10, #32
-    mov v4.16b, v16.16b
-    mov v5.16b, v17.16b
-    lsr x13, x12, #32
-    mov v6.16b, v18.16b
-    add w6, w21, #2
-    mov v7.16b, v19.16b
-    lsr x15, x14, #32
-    mov v8.16b, v16.16b
-    mov v9.16b, v17.16b
-    lsr x17, x16, #32
-    mov v10.16b, v18.16b
-    add w21, w21, #3
-    mov v11.16b, v19.16b
-    lsr x20, x19, #32
-    mov v7.s[0], w5
-    lsr x22, x21, #32
-    mov v11.s[0], w6
-    lsr x24, x23, #32
-    add w7, w21, #1
+    dup v0.4s, v16.s[0]
+    dup v1.4s, v16.s[1]
+    dup v2.4s, v16.s[2]
+    dup v3.4s, v16.s[3]
+    dup v4.4s, v17.s[0]
+    dup v5.4s, v17.s[1]
+    dup v6.4s, v17.s[2]
+    dup v7.4s, v17.s[3]
+    dup v8.4s, v18.s[0]
+    dup v9.4s, v18.s[1]
+    dup v10.4s, v18.s[2]
+    dup v11.4s, v18.s[3]
+    dup v12.4s, v19.s[0]
+    dup v13.4s, v19.s[1]
+    dup v14.4s, v19.s[2]
+    dup v15.4s, v19.s[3]
+    # Add to counter word
+    add v12.4s, v12.4s, v28.4s
     # Set number of odd+even rounds to perform
     mov x26, #10
L_chacha_crypt_bytes_arm64_round_start_256:
     subs x26, x26, #1
     # Round odd
     # a += b; d ^= a; d <<<= 16;
-    add v0.4s, v0.4s, v1.4s
-    add w8, w8, w12
-    add v4.4s, v4.4s, v5.4s
-    add w9, w9, w13
-    add v8.4s, v8.4s, v9.4s
-    add w10, w10, w14
-    eor v3.16b, v3.16b, v0.16b
-    add w11, w11, w15
-    eor v7.16b, v7.16b, v4.16b
-    eor w21, w21, w8
-    eor v11.16b, v11.16b, v8.16b
-    eor w22, w22, w9
-    rev32 v3.8h, v3.8h
-    eor w23, w23, w10
-    rev32 v7.8h, v7.8h
-    eor w24, w24, w11
-    rev32 v11.8h, v11.8h
-    ror w21, w21, #16
+    add v0.4s, v0.4s, v4.4s
+    add v1.4s, v1.4s, v5.4s
+    add v2.4s, v2.4s, v6.4s
+    add v3.4s, v3.4s, v7.4s
+    eor v12.16b, v12.16b, v0.16b
+    eor v13.16b, v13.16b, v1.16b
+    eor v14.16b, v14.16b, v2.16b
+    eor v15.16b, v15.16b, v3.16b
+    rev32 v12.8h, v12.8h
+    rev32 v13.8h, v13.8h
+    rev32 v14.8h, v14.8h
+    rev32 v15.8h, v15.8h
     # c += d; b ^= c; b <<<= 12;
-    add v2.4s, v2.4s, v3.4s
-    ror w22, w22, #16
-    add v6.4s, v6.4s, v7.4s
-    ror w23, w23, #16
-    add v10.4s, v10.4s, v11.4s
-    ror w24, w24, #16
-    eor v20.16b, v1.16b, v2.16b
-    add w16, w16, w21
-    eor v21.16b, v5.16b, v6.16b
-    add w17, w17, w22
-    eor v22.16b, v9.16b, v10.16b
-    add w19, w19, w23
-    shl v1.4s, v20.4s, #12
-    add w20, w20, w24
+    add v8.4s, v8.4s, v12.4s
+    add v9.4s, v9.4s, v13.4s
+    add v10.4s, v10.4s, v14.4s
+    add v11.4s, v11.4s, v15.4s
+    eor v20.16b, v4.16b, v8.16b
+    eor v21.16b, v5.16b, v9.16b
+    eor v22.16b, v6.16b, v10.16b
+    eor v23.16b, v7.16b, v11.16b
+    shl v4.4s, v20.4s, #12
     shl v5.4s, v21.4s, #12
-    eor w12, w12, w16
-    shl v9.4s, v22.4s, #12
-    eor w13, w13, w17
-    sri v1.4s, v20.4s, #20
-    eor w14, w14, w19
+    shl v6.4s, v22.4s, #12
+    shl v7.4s, v23.4s, #12
+    sri v4.4s, v20.4s, #20
     sri v5.4s, v21.4s, #20
-    eor w15, w15, w20
-    sri v9.4s, v22.4s, #20
-    ror w12, w12, #20
+    sri v6.4s, v22.4s, #20
+    sri v7.4s, v23.4s, #20
     # a += b; d ^= a; d <<<= 8;
-    add v0.4s, v0.4s, v1.4s
-    ror w13, w13, #20
-    add v4.4s, v4.4s, v5.4s
-    ror w14, w14, #20
-    add v8.4s, v8.4s, v9.4s
-    ror w15, w15, #20
-    eor v3.16b, v3.16b, v0.16b
-    add w8, w8, w12
-    eor v7.16b, v7.16b, v4.16b
-    add w9, w9, w13
-    eor v11.16b, v11.16b, v8.16b
-    add w10, w10, w14
-    tbl v3.16b, {v3.16b}, v30.16b
-    add w11, w11, w15
-    tbl v7.16b, {v7.16b}, v30.16b
-    eor w21, w21, w8
-    tbl v11.16b, {v11.16b}, v30.16b
-    eor w22, w22, w9
+    add v0.4s, v0.4s, v4.4s
+    add v1.4s, v1.4s, v5.4s
+    add v2.4s, v2.4s, v6.4s
+    add v3.4s, v3.4s, v7.4s
+    eor v12.16b, v12.16b, v0.16b
+    eor v13.16b, v13.16b, v1.16b
+    eor v14.16b, v14.16b, v2.16b
+    eor v15.16b, v15.16b, v3.16b
+    tbl v12.16b, {v12.16b}, v30.16b
+    tbl v13.16b, {v13.16b}, v30.16b
+    tbl v14.16b, {v14.16b}, v30.16b
+    tbl v15.16b, {v15.16b}, v30.16b
     # c += d; b ^= c; b <<<= 7;
-    add v2.4s, v2.4s, v3.4s
-    eor w23, w23, w10
-    add v6.4s, v6.4s, v7.4s
-    eor w24, w24, w11
-    add v10.4s, v10.4s, v11.4s
-    ror w21, w21, #24
-    eor v20.16b, v1.16b, v2.16b
-    ror w22, w22, #24
-    eor v21.16b, v5.16b, v6.16b
-    ror w23, w23, #24
-    eor v22.16b, v9.16b, v10.16b
-    ror w24, w24, #24
-    shl v1.4s, v20.4s, #7
-    add w16, w16, w21
+    add v8.4s, v8.4s, v12.4s
+    add v9.4s, v9.4s, v13.4s
+    add v10.4s, v10.4s, v14.4s
+    add v11.4s, v11.4s, v15.4s
+    eor v20.16b, v4.16b, v8.16b
+    eor v21.16b, v5.16b, v9.16b
+    eor v22.16b, v6.16b, v10.16b
+    eor v23.16b, v7.16b, v11.16b
+    shl v4.4s, v20.4s, #7
     shl v5.4s, v21.4s, #7
-    add w17, w17, w22
-    shl v9.4s, v22.4s, #7
-    add w19, w19, w23
-    sri v1.4s, v20.4s, #25
-    add w20, w20, w24
+    shl v6.4s, v22.4s, #7
+    shl v7.4s, v23.4s, #7
+    sri v4.4s, v20.4s, #25
     sri v5.4s, v21.4s, #25
-    eor w12, w12, w16
-    sri v9.4s, v22.4s, #25
-    eor w13, w13, w17
-    ext v3.16b, v3.16b, v3.16b, #12
-    eor w14, w14, w19
-    ext v7.16b, v7.16b, v7.16b, #12
-    eor w15, w15, w20
-    ext v11.16b, v11.16b, v11.16b, #12
-    ror w12, w12, #25
-    ext v1.16b, v1.16b, v1.16b, #4
-    ror w13, w13, #25
-    ext v5.16b, v5.16b, v5.16b, #4
-    ror w14, w14, #25
-    ext v9.16b, v9.16b, v9.16b, #4
-    ror w15, w15, #25
-    ext v2.16b, v2.16b, v2.16b, #8
-    ext v6.16b, v6.16b, v6.16b, #8
-    ext v10.16b, v10.16b, v10.16b, #8
+    sri v6.4s, v22.4s, #25
+    sri v7.4s, v23.4s, #25
     # Round even
     # a += b; d ^= a; d <<<= 16;
-    add v0.4s, v0.4s, v1.4s
-    add w8, w8, w13
-    add v4.4s, v4.4s, v5.4s
-    add w9, w9, w14
-    add v8.4s, v8.4s, v9.4s
-    add w10, w10, w15
-    eor v3.16b, v3.16b, v0.16b
-    add w11, w11, w12
-    eor v7.16b, v7.16b, v4.16b
-    eor w24, w24, w8
-    eor v11.16b, v11.16b, v8.16b
-    eor w21, w21, w9
-    rev32 v3.8h, v3.8h
-    eor w22, w22, w10
-    rev32 v7.8h, v7.8h
-    eor w23, w23, w11
-    rev32 v11.8h, v11.8h
-    ror w24, w24, #16
+    add v0.4s, v0.4s, v5.4s
+    add v1.4s, v1.4s, v6.4s
+    add v2.4s, v2.4s, v7.4s
+    add v3.4s, v3.4s, v4.4s
+    eor v15.16b, v15.16b, v0.16b
+    eor v12.16b, v12.16b, v1.16b
+    eor v13.16b, v13.16b, v2.16b
+    eor v14.16b, v14.16b, v3.16b
+    rev32 v15.8h, v15.8h
+    rev32 v12.8h, v12.8h
+    rev32 v13.8h, v13.8h
+    rev32 v14.8h, v14.8h
     # c += d; b ^= c; b <<<= 12;
-    add v2.4s, v2.4s, v3.4s
-    ror w21, w21, #16
-    add v6.4s, v6.4s, v7.4s
-    ror w22, w22, #16
-    add v10.4s, v10.4s, v11.4s
-    ror w23, w23, #16
-    eor v20.16b, v1.16b, v2.16b
-    add w19, w19, w24
-    eor v21.16b, v5.16b, v6.16b
-    add w20, w20, w21
-    eor v22.16b, v9.16b, v10.16b
-    add w16, w16, w22
-    shl v1.4s, v20.4s, #12
-    add w17, w17, w23
-    shl v5.4s, v21.4s, #12
-    eor w13, w13, w19
-    shl v9.4s, v22.4s, #12
-    eor w14, w14, w20
-    sri v1.4s, v20.4s, #20
-    eor w15, w15, w16
-    sri v5.4s, v21.4s, #20
-    eor w12, w12, w17
-    sri v9.4s, v22.4s, #20
-    ror w13, w13, #20
+    add v10.4s, v10.4s, v15.4s
+    add v11.4s, v11.4s, v12.4s
+    add v8.4s, v8.4s, v13.4s
+    add v9.4s, v9.4s, v14.4s
+    eor v20.16b, v5.16b, v10.16b
+    eor v21.16b, v6.16b, v11.16b
+    eor v22.16b, v7.16b, v8.16b
+    eor v23.16b, v4.16b, v9.16b
+    shl v5.4s, v20.4s, #12
+    shl v6.4s, v21.4s, #12
+    shl v7.4s, v22.4s, #12
+    shl v4.4s, v23.4s, #12
+    sri v5.4s, v20.4s, #20
+    sri v6.4s, v21.4s, #20
+    sri v7.4s, v22.4s, #20
+    sri v4.4s, v23.4s, #20
     # a += b; d ^= a; d <<<= 8;
-    add v0.4s, v0.4s, v1.4s
-    ror w14, w14, #20
-    add v4.4s, v4.4s, v5.4s
-    ror w15, w15, #20
-    add v8.4s, v8.4s, v9.4s
-    ror w12, w12, #20
-    eor v3.16b, v3.16b, v0.16b
-    add w8, w8, w13
-    eor v7.16b, v7.16b, v4.16b
-    add w9, w9, w14
-    eor v11.16b, v11.16b, v8.16b
-    add w10, w10, w15
-    tbl v3.16b, {v3.16b}, v30.16b
-    add w11, w11, w12
-    tbl v7.16b, {v7.16b}, v30.16b
-    eor w24, w24, w8
-    tbl v11.16b, {v11.16b}, v30.16b
-    eor w21, w21, w9
+    add v0.4s, v0.4s, v5.4s
+    add v1.4s, v1.4s, v6.4s
+    add v2.4s, v2.4s, v7.4s
+    add v3.4s, v3.4s, v4.4s
+    eor v15.16b, v15.16b, v0.16b
+    eor v12.16b, v12.16b, v1.16b
+    eor v13.16b, v13.16b, v2.16b
+    eor v14.16b, v14.16b, v3.16b
+    tbl v15.16b, {v15.16b}, v30.16b
+    tbl v12.16b, {v12.16b}, v30.16b
+    tbl v13.16b, {v13.16b}, v30.16b
+    tbl v14.16b, {v14.16b}, v30.16b
     # c += d; b ^= c; b <<<= 7;
-    add v2.4s, v2.4s, v3.4s
-    eor w22, w22, w10
-    add v6.4s, v6.4s, v7.4s
-    eor w23, w23, w11
-    add v10.4s, v10.4s, v11.4s
-    ror w24, w24, #24
-    eor v20.16b, v1.16b, v2.16b
-    ror w21, w21, #24
-    eor v21.16b, v5.16b, v6.16b
-    ror w22, w22, #24
-    eor v22.16b, v9.16b, v10.16b
-    ror w23, w23, #24
-    shl v1.4s, v20.4s, #7
-    add w19, w19, w24
-    shl v5.4s, v21.4s, #7
-    add w20, w20, w21
-    shl v9.4s, v22.4s, #7
-    add w16, w16, w22
-    sri v1.4s, v20.4s, #25
-    add w17, w17, w23
-    sri v5.4s, v21.4s, #25
-    eor w13, w13, w19
-    sri v9.4s, v22.4s, #25
-    eor w14, w14, w20
-    ext v3.16b, v3.16b, v3.16b, #4
-    eor w15, w15, w16
-    ext v7.16b, v7.16b, v7.16b, #4
-    eor w12, w12, w17
-    ext v11.16b, v11.16b, v11.16b, #4
-    ror w13, w13, #25
-    ext v1.16b, v1.16b, v1.16b, #12
-    ror w14, w14, #25
-    ext v5.16b, v5.16b, v5.16b, #12
-    ror w15, w15, #25
-    ext v9.16b, v9.16b, v9.16b, #12
-    ror w12, w12, #25
-    ext v2.16b, v2.16b, v2.16b, #8
-    ext v6.16b, v6.16b, v6.16b, #8
-    ext v10.16b, v10.16b, v10.16b, #8
+    add v10.4s, v10.4s, v15.4s
+    add v11.4s, v11.4s, v12.4s
+    add v8.4s, v8.4s, v13.4s
+    add v9.4s, v9.4s, v14.4s
+    eor v20.16b, v5.16b, v10.16b
+    eor v21.16b, v6.16b, v11.16b
+    eor v22.16b, v7.16b, v8.16b
+    eor v23.16b, v4.16b, v9.16b
+    shl v5.4s, v20.4s, #7
+    shl v6.4s, v21.4s, #7
+    shl v7.4s, v22.4s, #7
+    shl v4.4s, v23.4s, #7
+    sri v5.4s, v20.4s, #25
+    sri v6.4s, v21.4s, #25
+    sri v7.4s, v22.4s, #25
+    sri v4.4s, v23.4s, #25
     bne L_chacha_crypt_bytes_arm64_round_start_256
+    mov x26, #4
+    # Add counter now rather than after transposing
+    add v12.4s, v12.4s, v28.4s
     # Load message
-    ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
-    # Add one (2 added during calculating vector results)
-    add w21, w21, #1
-    # Add back state, XOR msg, store (load next block)
-    add v0.4s, v0.4s, v16.4s
-    add v1.4s, v1.4s, v17.4s
-    add v2.4s, v2.4s, v18.4s
-    add v3.4s, v3.4s, v19.4s
-    eor v0.16b, v0.16b, v20.16b
-    eor v1.16b, v1.16b, v21.16b
-    eor v2.16b, v2.16b, v22.16b
-    eor v3.16b, v3.16b, v23.16b
-    ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
-    st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #0x40
-    mov v19.s[0], w5
-    add v4.4s, v4.4s, v16.4s
-    add v5.4s, v5.4s, v17.4s
-    add v6.4s, v6.4s, v18.4s
-    add v7.4s, v7.4s, v19.4s
-    eor v4.16b, v4.16b, v20.16b
-    eor v5.16b, v5.16b, v21.16b
-    eor v6.16b, v6.16b, v22.16b
-    eor v7.16b, v7.16b, v23.16b
-    ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
-    st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #0x40
-    mov v19.s[0], w6
-    add v8.4s, v8.4s, v16.4s
-    add v9.4s, v9.4s, v17.4s
-    add v10.4s, v10.4s, v18.4s
-    add v11.4s, v11.4s, v19.4s
-    eor v8.16b, v8.16b, v20.16b
-    eor v9.16b, v9.16b, v21.16b
-    eor v10.16b, v10.16b, v22.16b
-    eor v11.16b, v11.16b, v23.16b
-    ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
-    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x1], #0x40
-    # Move regular registers into vector registers for adding and xor
-    orr x8, x8, x9, lsl 32
-    orr x10, x10, x11, lsl 32
-    orr x12, x12, x13, lsl 32
-    mov v0.d[0], x8
-    orr x14, x14, x15, lsl 32
-    mov v0.d[1], x10
-    orr x16, x16, x17, lsl 32
-    mov v1.d[0], x12
-    orr x19, x19, x20, lsl 32
-    mov v1.d[1], x14
-    orr x21, x21, x22, lsl 32
-    mov v2.d[0], x16
-    orr x23, x23, x24, lsl 32
-    mov v2.d[1], x19
-    mov v3.d[0], x21
-    mov v3.d[1], x23
-    # Add back state, XOR in message and store
-    add v0.4s, v0.4s, v16.4s
-    add v1.4s, v1.4s, v17.4s
-    add v2.4s, v2.4s, v18.4s
-    add v3.4s, v3.4s, v19.4s
-    eor v0.16b, v0.16b, v20.16b
-    eor v1.16b, v1.16b, v21.16b
-    eor v2.16b, v2.16b, v22.16b
-    eor v3.16b, v3.16b, v23.16b
-    st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #0x40
-    mov v19.d[0], x7
+    ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
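+    # After the rounds each of v0-v15 holds one state word for all four
+    # blocks; the 4x4 transposes below regroup them so each register again
+    # holds one row of one block before the input state is added back.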
+    # Transpose vectors
+    trn1 v20.4s, v0.4s, v1.4s
+    trn1 v22.4s, v2.4s, v3.4s
+    trn2 v21.4s, v0.4s, v1.4s
+    trn2 v23.4s, v2.4s, v3.4s
+    trn1 v0.2d, v20.2d, v22.2d
+    trn1 v1.2d, v21.2d, v23.2d
+    trn2 v2.2d, v20.2d, v22.2d
+    trn2 v3.2d, v21.2d, v23.2d
+    trn1 v20.4s, v4.4s, v5.4s
+    trn1 v22.4s, v6.4s, v7.4s
+    trn2 v21.4s, v4.4s, v5.4s
+    trn2 v23.4s, v6.4s, v7.4s
+    trn1 v4.2d, v20.2d, v22.2d
+    trn1 v5.2d, v21.2d, v23.2d
+    trn2 v6.2d, v20.2d, v22.2d
+    trn2 v7.2d, v21.2d, v23.2d
+    trn1 v20.4s, v8.4s, v9.4s
+    trn1 v22.4s, v10.4s, v11.4s
+    trn2 v21.4s, v8.4s, v9.4s
+    trn2 v23.4s, v10.4s, v11.4s
+    trn1 v8.2d, v20.2d, v22.2d
+    trn1 v9.2d, v21.2d, v23.2d
+    trn2 v10.2d, v20.2d, v22.2d
+    trn2 v11.2d, v21.2d, v23.2d
+    trn1 v20.4s, v12.4s, v13.4s
+    trn1 v22.4s, v14.4s, v15.4s
+    trn2 v21.4s, v12.4s, v13.4s
+    trn2 v23.4s, v14.4s, v15.4s
+    trn1 v12.2d, v20.2d, v22.2d
+    trn1 v13.2d, v21.2d, v23.2d
+    trn2 v14.2d, v20.2d, v22.2d
+    trn2 v15.2d, v21.2d, v23.2d
+    # Add back state, XOR in message and store (load next block)
+    add v20.4s, v0.4s, v16.4s
+    add v21.4s, v4.4s, v17.4s
+    add v22.4s, v8.4s, v18.4s
+    add v23.4s, v12.4s, v19.4s
+    eor v20.16b, v20.16b, v24.16b
+    eor v21.16b, v21.16b, v25.16b
+    eor v22.16b, v22.16b, v26.16b
+    eor v23.16b, v23.16b, v27.16b
+    ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
+    st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
+    add v20.4s, v1.4s, v16.4s
+    add v21.4s, v5.4s, v17.4s
+    add v22.4s, v9.4s, v18.4s
+    add v23.4s, v13.4s, v19.4s
+    eor v20.16b, v20.16b, v24.16b
+    eor v21.16b, v21.16b, v25.16b
+    eor v22.16b, v22.16b, v26.16b
+    eor v23.16b, v23.16b, v27.16b
+    ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
+    st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
+    add v20.4s, v2.4s, v16.4s
+    add v21.4s, v6.4s, v17.4s
+    add v22.4s, v10.4s, v18.4s
+    add v23.4s, v14.4s, v19.4s
+    eor v20.16b, v20.16b, v24.16b
+    eor v21.16b, v21.16b, v25.16b
+    eor v22.16b, v22.16b, v26.16b
+    eor v23.16b, v23.16b, v27.16b
+    ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
+    st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
+    add v20.4s, v3.4s, v16.4s
+    add v21.4s, v7.4s, v17.4s
+    add v22.4s, v11.4s, v18.4s
+    add v23.4s, v15.4s, v19.4s
+    eor v20.16b, v20.16b, v24.16b
+    eor v21.16b, v21.16b, v25.16b
+    eor v22.16b, v22.16b, v26.16b
+    eor v23.16b, v23.16b, v27.16b
+    st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
+    mov v29.s[0], w26
     sub x3, x3, #0x100
+    add v19.4s, v19.4s, v29.4s
     # Done 256-byte block
 L_chacha_crypt_bytes_arm64_lt_256:
     cmp x3, #0x80
diff --git a/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c b/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c
index 9722546fb..fbf67c193 100644
--- a/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c
+++ b/wolfcrypt/src/port/arm/armv8-chacha-asm_c.c
@@ -439,42 +439,25 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "L_chacha_crypt_bytes_arm64_lt_320_%=: \n\t"
         "cmp %w[len], #0x100\n\t"
         "b.lt L_chacha_crypt_bytes_arm64_lt_256_%=\n\t"
-        /* Move state into regular register */
-        "mov x8, v16.d[0]\n\t"
-        "mov x10, v16.d[1]\n\t"
-        "mov x12, v17.d[0]\n\t"
-        "mov x14, v17.d[1]\n\t"
-        "mov x16, v18.d[0]\n\t"
-        "mov x19, v18.d[1]\n\t"
-        "mov x21, v19.d[0]\n\t"
-        "mov x23, v19.d[1]\n\t"
         /* Move state into vector registers */
-        "mov v0.16b, v16.16b\n\t"
-        "mov v1.16b, v17.16b\n\t"
-        "lsr x9, x8, #32\n\t"
-        "mov v2.16b, v18.16b\n\t"
-        "add %w[rol8], w21, #1\n\t"
-        "mov v3.16b, v19.16b\n\t"
-        "lsr x11, x10, #32\n\t"
-        "mov v4.16b, v16.16b\n\t"
-        "mov v5.16b, v17.16b\n\t"
-        "lsr x13, x12, #32\n\t"
-        "mov v6.16b, v18.16b\n\t"
-        "add %w[ctr], w21, #2\n\t"
-        "mov v7.16b, v19.16b\n\t"
-        "lsr x15, x14, #32\n\t"
-        "mov v8.16b, v16.16b\n\t"
-        "mov v9.16b, v17.16b\n\t"
-        "lsr x17, x16, #32\n\t"
-        "mov v10.16b, v18.16b\n\t"
-        "add w21, w21, #3\n\t"
-        "mov v11.16b, v19.16b\n\t"
-        "lsr x20, x19, #32\n\t"
-        "mov v7.s[0], %w[rol8]\n\t"
-        "lsr x22, x21, #32\n\t"
-        "mov v11.s[0], %w[ctr]\n\t"
-        "lsr x24, x23, #32\n\t"
-        "add w7, w21, #1\n\t"
+        "dup v0.4s, v16.s[0]\n\t"
+        "dup v1.4s, v16.s[1]\n\t"
+        "dup v2.4s, v16.s[2]\n\t"
+        "dup v3.4s, v16.s[3]\n\t"
+        "dup v4.4s, v17.s[0]\n\t"
+        "dup v5.4s, v17.s[1]\n\t"
+        "dup v6.4s, v17.s[2]\n\t"
+        "dup v7.4s, v17.s[3]\n\t"
+        "dup v8.4s, v18.s[0]\n\t"
+        "dup v9.4s, v18.s[1]\n\t"
+        "dup v10.4s, v18.s[2]\n\t"
+        "dup v11.4s, v18.s[3]\n\t"
+        "dup v12.4s, v19.s[0]\n\t"
+        "dup v13.4s, v19.s[1]\n\t"
+        "dup v14.4s, v19.s[2]\n\t"
+        "dup v15.4s, v19.s[3]\n\t"
+        /* Add to counter word */
+        "add v12.4s, v12.4s, v28.4s\n\t"
         /* Set number of odd+even rounds to perform */
         "mov x26, #10\n\t"
         "\n"
@@ -482,279 +465,208 @@ void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
         "subs x26, x26, #1\n\t"
         /* Round odd */
         /* a += b; d ^= a; d <<<= 16; */
-        "add v0.4s, v0.4s, v1.4s\n\t"
-        "add w8, w8, w12\n\t"
-        "add v4.4s, v4.4s, v5.4s\n\t"
-        "add w9, w9, w13\n\t"
-        "add v8.4s, v8.4s, v9.4s\n\t"
-        "add w10, w10, w14\n\t"
-        "eor v3.16b, v3.16b, v0.16b\n\t"
-        "add w11, w11, w15\n\t"
-        "eor v7.16b, v7.16b, v4.16b\n\t"
-        "eor w21, w21, w8\n\t"
-        "eor v11.16b, v11.16b, v8.16b\n\t"
-        "eor w22, w22, w9\n\t"
-        "rev32 v3.8h, v3.8h\n\t"
-        "eor w23, w23, w10\n\t"
-        "rev32 v7.8h, v7.8h\n\t"
-        "eor w24, w24, w11\n\t"
-        "rev32 v11.8h, v11.8h\n\t"
-        "ror w21, w21, #16\n\t"
+        "add v0.4s, v0.4s, v4.4s\n\t"
+        "add v1.4s, v1.4s, v5.4s\n\t"
+        "add v2.4s, v2.4s, v6.4s\n\t"
+        "add v3.4s, v3.4s, v7.4s\n\t"
+        "eor v12.16b, v12.16b, v0.16b\n\t"
+        "eor v13.16b, v13.16b, v1.16b\n\t"
+        "eor v14.16b, v14.16b, v2.16b\n\t"
+        "eor v15.16b, v15.16b, v3.16b\n\t"
+        "rev32 v12.8h, v12.8h\n\t"
+        "rev32 v13.8h, v13.8h\n\t"
+        "rev32 v14.8h, v14.8h\n\t"
+        "rev32 v15.8h, v15.8h\n\t"
         /* c += d; b ^= c; b <<<= 12; */
-        "add v2.4s, v2.4s, v3.4s\n\t"
-        "ror w22, w22, #16\n\t"
-        "add v6.4s, v6.4s, v7.4s\n\t"
-        "ror w23, w23, #16\n\t"
-        "add v10.4s, v10.4s, v11.4s\n\t"
-        "ror w24, w24, #16\n\t"
-        "eor v20.16b, v1.16b, v2.16b\n\t"
-        "add w16, w16, w21\n\t"
-        "eor v21.16b, v5.16b, v6.16b\n\t"
-        "add w17, w17, w22\n\t"
-        "eor v22.16b, v9.16b, v10.16b\n\t"
-        "add w19, w19, w23\n\t"
-        "shl v1.4s, v20.4s, #12\n\t"
-        "add w20, w20, w24\n\t"
+        "add v8.4s, v8.4s, v12.4s\n\t"
+        "add v9.4s, v9.4s, v13.4s\n\t"
+        "add v10.4s, v10.4s, v14.4s\n\t"
+        "add v11.4s, v11.4s, v15.4s\n\t"
+        "eor v20.16b, v4.16b, v8.16b\n\t"
+        "eor v21.16b, v5.16b, v9.16b\n\t"
+        "eor v22.16b, v6.16b, v10.16b\n\t"
+        "eor v23.16b, v7.16b, v11.16b\n\t"
+        "shl v4.4s, v20.4s, #12\n\t"
         "shl v5.4s, v21.4s, #12\n\t"
-        "eor w12, w12, w16\n\t"
-        "shl v9.4s, v22.4s, #12\n\t"
-        "eor w13, w13, w17\n\t"
-        "sri v1.4s, v20.4s, #20\n\t"
-        "eor w14, w14, w19\n\t"
+        "shl v6.4s, v22.4s, #12\n\t"
+        "shl v7.4s, v23.4s, #12\n\t"
+        "sri v4.4s, v20.4s, #20\n\t"
         "sri v5.4s, v21.4s, #20\n\t"
-        "eor w15, w15, w20\n\t"
-        "sri v9.4s, v22.4s, #20\n\t"
-        "ror w12, w12, #20\n\t"
+        "sri v6.4s, v22.4s, #20\n\t"
+        "sri v7.4s, v23.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
-        "add v0.4s, v0.4s, v1.4s\n\t"
-        "ror w13, w13, #20\n\t"
-        "add v4.4s, v4.4s, v5.4s\n\t"
-        "ror w14, w14, #20\n\t"
-        "add v8.4s, v8.4s, v9.4s\n\t"
-        "ror w15, w15, #20\n\t"
-        "eor v3.16b, v3.16b, v0.16b\n\t"
-        "add w8, w8, w12\n\t"
-        "eor v7.16b, v7.16b, v4.16b\n\t"
-        "add w9, w9, w13\n\t"
-        "eor v11.16b, v11.16b, v8.16b\n\t"
-        "add w10, w10, w14\n\t"
-        "tbl v3.16b, {v3.16b}, v30.16b\n\t"
-        "add w11, w11, w15\n\t"
-        "tbl v7.16b, {v7.16b}, v30.16b\n\t"
-        "eor w21, w21, w8\n\t"
-        "tbl v11.16b, {v11.16b}, v30.16b\n\t"
-        "eor w22, w22, w9\n\t"
+        "add v0.4s, v0.4s, v4.4s\n\t"
+        "add v1.4s, v1.4s, v5.4s\n\t"
+        "add v2.4s, v2.4s, v6.4s\n\t"
+        "add v3.4s, v3.4s, v7.4s\n\t"
+        "eor v12.16b, v12.16b, v0.16b\n\t"
+        "eor v13.16b, v13.16b, v1.16b\n\t"
+        "eor v14.16b, v14.16b, v2.16b\n\t"
+        "eor v15.16b, v15.16b, v3.16b\n\t"
+        "tbl v12.16b, {v12.16b}, v30.16b\n\t"
+        "tbl v13.16b, {v13.16b}, v30.16b\n\t"
+        "tbl v14.16b, {v14.16b}, v30.16b\n\t"
+        "tbl v15.16b, {v15.16b}, v30.16b\n\t"
         /* c += d; b ^= c; b <<<= 7; */
-        "add v2.4s, v2.4s, v3.4s\n\t"
-        "eor w23, w23, w10\n\t"
-        "add v6.4s, v6.4s, v7.4s\n\t"
-        "eor w24, w24, w11\n\t"
-        "add v10.4s, v10.4s, v11.4s\n\t"
-        "ror w21, w21, #24\n\t"
-        "eor v20.16b, v1.16b, v2.16b\n\t"
-        "ror w22, w22, #24\n\t"
-        "eor v21.16b, v5.16b, v6.16b\n\t"
-        "ror w23, w23, #24\n\t"
-        "eor v22.16b, v9.16b, v10.16b\n\t"
-        "ror w24, w24, #24\n\t"
-        "shl v1.4s, v20.4s, #7\n\t"
-        "add w16, w16, w21\n\t"
+        "add v8.4s, v8.4s, v12.4s\n\t"
+        "add v9.4s, v9.4s, v13.4s\n\t"
+        "add v10.4s, v10.4s, v14.4s\n\t"
+        "add v11.4s, v11.4s, v15.4s\n\t"
+        "eor v20.16b, v4.16b, v8.16b\n\t"
+        "eor v21.16b, v5.16b, v9.16b\n\t"
+        "eor v22.16b, v6.16b, v10.16b\n\t"
+        "eor v23.16b, v7.16b, v11.16b\n\t"
+        "shl v4.4s, v20.4s, #7\n\t"
         "shl v5.4s, v21.4s, #7\n\t"
-        "add w17, w17, w22\n\t"
-        "shl v9.4s, v22.4s, #7\n\t"
-        "add w19, w19, w23\n\t"
-        "sri v1.4s, v20.4s, #25\n\t"
-        "add w20, w20, w24\n\t"
+        "shl v6.4s, v22.4s, #7\n\t"
+        "shl v7.4s, v23.4s, #7\n\t"
+        "sri v4.4s, v20.4s, #25\n\t"
         "sri v5.4s, v21.4s, #25\n\t"
-        "eor w12, w12, w16\n\t"
-        "sri v9.4s, v22.4s, #25\n\t"
-        "eor w13, w13, w17\n\t"
-        "ext v3.16b, v3.16b, v3.16b, #12\n\t"
-        "eor w14, w14, w19\n\t"
-        "ext v7.16b, v7.16b, v7.16b, #12\n\t"
-        "eor w15, w15, w20\n\t"
-        "ext v11.16b, v11.16b, v11.16b, #12\n\t"
-        "ror w12, w12, #25\n\t"
-        "ext v1.16b, v1.16b, v1.16b, #4\n\t"
-        "ror w13, w13, #25\n\t"
-        "ext v5.16b, v5.16b, v5.16b, #4\n\t"
-        "ror w14, w14, #25\n\t"
-        "ext v9.16b, v9.16b, v9.16b, #4\n\t"
-        "ror w15, w15, #25\n\t"
-        "ext v2.16b, v2.16b, v2.16b, #8\n\t"
-        "ext v6.16b, v6.16b, v6.16b, #8\n\t"
-        "ext v10.16b, v10.16b, v10.16b, #8\n\t"
+        "sri v6.4s, v22.4s, #25\n\t"
+        "sri v7.4s, v23.4s, #25\n\t"
         /* Round even */
         /* a += b; d ^= a; d <<<= 16; */
-        "add v0.4s, v0.4s, v1.4s\n\t"
-        "add w8, w8, w13\n\t"
-        "add v4.4s, v4.4s, v5.4s\n\t"
-        "add w9, w9, w14\n\t"
-        "add v8.4s, v8.4s, v9.4s\n\t"
-        "add w10, w10, w15\n\t"
-        "eor v3.16b, v3.16b, v0.16b\n\t"
-        "add w11, w11, w12\n\t"
-        "eor v7.16b, v7.16b, v4.16b\n\t"
-        "eor w24, w24, w8\n\t"
-        "eor v11.16b, v11.16b, v8.16b\n\t"
-        "eor w21, w21, w9\n\t"
-        "rev32 v3.8h, v3.8h\n\t"
-        "eor w22, w22, w10\n\t"
-        "rev32 v7.8h, v7.8h\n\t"
-        "eor w23, w23, w11\n\t"
-        "rev32 v11.8h, v11.8h\n\t"
-        "ror w24, w24, #16\n\t"
+        "add v0.4s, v0.4s, v5.4s\n\t"
+        "add v1.4s, v1.4s, v6.4s\n\t"
+        "add v2.4s, v2.4s, v7.4s\n\t"
+        "add v3.4s, v3.4s, v4.4s\n\t"
+        "eor v15.16b, v15.16b, v0.16b\n\t"
+        "eor v12.16b, v12.16b, v1.16b\n\t"
+        "eor v13.16b, v13.16b, v2.16b\n\t"
+        "eor v14.16b, v14.16b, v3.16b\n\t"
+        "rev32 v15.8h, v15.8h\n\t"
+        "rev32 v12.8h, v12.8h\n\t"
+        "rev32 v13.8h, v13.8h\n\t"
+        "rev32 v14.8h, v14.8h\n\t"
         /* c += d; b ^= c; b <<<= 12; */
-        "add v2.4s, v2.4s, v3.4s\n\t"
-        "ror w21, w21, #16\n\t"
-        "add v6.4s, v6.4s, v7.4s\n\t"
-        "ror w22, w22, #16\n\t"
-        "add v10.4s, v10.4s, v11.4s\n\t"
-        "ror w23, w23, #16\n\t"
-        "eor v20.16b, v1.16b, v2.16b\n\t"
-        "add w19, w19, w24\n\t"
-        "eor v21.16b, v5.16b, v6.16b\n\t"
-        "add w20, w20, w21\n\t"
-        "eor v22.16b, v9.16b, v10.16b\n\t"
-        "add w16, w16, w22\n\t"
-        "shl v1.4s, v20.4s, #12\n\t"
-        "add w17, w17, w23\n\t"
-        "shl v5.4s, v21.4s, #12\n\t"
-        "eor w13, w13, w19\n\t"
-        "shl v9.4s, v22.4s, #12\n\t"
-        "eor w14, w14, w20\n\t"
-        "sri v1.4s, v20.4s, #20\n\t"
-        "eor w15, w15, w16\n\t"
-        "sri v5.4s, v21.4s, #20\n\t"
-        "eor w12, w12, w17\n\t"
-        "sri v9.4s, v22.4s, #20\n\t"
-        "ror w13, w13, #20\n\t"
+        "add v10.4s, v10.4s, v15.4s\n\t"
+        "add v11.4s, v11.4s, v12.4s\n\t"
+        "add v8.4s, v8.4s, v13.4s\n\t"
+        "add v9.4s, v9.4s, v14.4s\n\t"
+        "eor v20.16b, v5.16b, v10.16b\n\t"
+        "eor v21.16b, v6.16b, v11.16b\n\t"
+        "eor v22.16b, v7.16b, v8.16b\n\t"
+        "eor v23.16b, v4.16b, v9.16b\n\t"
+        "shl v5.4s, v20.4s, #12\n\t"
+        "shl v6.4s, v21.4s, #12\n\t"
+        "shl v7.4s, v22.4s, #12\n\t"
+        "shl v4.4s, v23.4s, #12\n\t"
+        "sri v5.4s, v20.4s, #20\n\t"
+        "sri v6.4s, v21.4s, #20\n\t"
+        "sri v7.4s, v22.4s, #20\n\t"
+        "sri v4.4s, v23.4s, #20\n\t"
         /* a += b; d ^= a; d <<<= 8; */
-        "add v0.4s, v0.4s, v1.4s\n\t"
-        "ror w14, w14, #20\n\t"
-        "add v4.4s, v4.4s, v5.4s\n\t"
-        "ror w15, w15, #20\n\t"
-        "add v8.4s, v8.4s, v9.4s\n\t"
-        "ror w12, w12, #20\n\t"
-        "eor v3.16b, v3.16b, v0.16b\n\t"
-        "add w8, w8, w13\n\t"
-        "eor v7.16b, v7.16b, v4.16b\n\t"
-        "add w9, w9, w14\n\t"
-        "eor v11.16b, v11.16b, v8.16b\n\t"
-        "add w10, w10, w15\n\t"
-        "tbl v3.16b, {v3.16b}, v30.16b\n\t"
-        "add w11, w11, w12\n\t"
-        "tbl v7.16b, {v7.16b}, v30.16b\n\t"
-        "eor w24, w24, w8\n\t"
-        "tbl v11.16b, {v11.16b}, v30.16b\n\t"
-        "eor w21, w21, w9\n\t"
+        "add v0.4s, v0.4s, v5.4s\n\t"
+        "add v1.4s, v1.4s, v6.4s\n\t"
+        "add v2.4s, v2.4s, v7.4s\n\t"
+        "add v3.4s, v3.4s, v4.4s\n\t"
+        "eor v15.16b, v15.16b, v0.16b\n\t"
+        "eor v12.16b, v12.16b, v1.16b\n\t"
+        "eor v13.16b, v13.16b, v2.16b\n\t"
+        "eor v14.16b, v14.16b, v3.16b\n\t"
+        "tbl v15.16b, {v15.16b}, v30.16b\n\t"
+        "tbl v12.16b, {v12.16b}, v30.16b\n\t"
+        "tbl v13.16b, {v13.16b}, v30.16b\n\t"
+        "tbl v14.16b, {v14.16b}, v30.16b\n\t"
         /* c += d; b ^= c; b <<<= 7; */
-        "add v2.4s, v2.4s, v3.4s\n\t"
-        "eor w22, w22, w10\n\t"
-        "add v6.4s, v6.4s, v7.4s\n\t"
-        "eor w23, w23, w11\n\t"
-        "add v10.4s, v10.4s, v11.4s\n\t"
-        "ror w24, w24, #24\n\t"
-        "eor v20.16b, v1.16b, v2.16b\n\t"
-        "ror w21, w21, #24\n\t"
-        "eor v21.16b, v5.16b, v6.16b\n\t"
-        "ror w22, w22, #24\n\t"
-        "eor v22.16b, v9.16b, v10.16b\n\t"
-        "ror w23, w23, #24\n\t"
-        "shl v1.4s, v20.4s, #7\n\t"
-        "add w19, w19, w24\n\t"
-        "shl v5.4s, v21.4s, #7\n\t"
-        "add w20, w20, w21\n\t"
-        "shl v9.4s, v22.4s, #7\n\t"
-        "add w16, w16, w22\n\t"
-        "sri v1.4s, v20.4s, #25\n\t"
-        "add w17, w17, w23\n\t"
-        "sri v5.4s, v21.4s, #25\n\t"
-        "eor w13, w13, w19\n\t"
-        "sri v9.4s, v22.4s, #25\n\t"
-        "eor w14, w14, w20\n\t"
-        "ext v3.16b, v3.16b, v3.16b, #4\n\t"
-        "eor w15, w15, w16\n\t"
-        "ext v7.16b, v7.16b, v7.16b, #4\n\t"
-        "eor w12, w12, w17\n\t"
-        "ext v11.16b, v11.16b, v11.16b, #4\n\t"
-        "ror w13, w13, #25\n\t"
-        "ext v1.16b, v1.16b, v1.16b, #12\n\t"
-        "ror w14, w14, #25\n\t"
-        "ext v5.16b, v5.16b, v5.16b, #12\n\t"
-        "ror w15, w15, #25\n\t"
-        "ext v9.16b, v9.16b, v9.16b, #12\n\t"
-        "ror w12, w12, #25\n\t"
-        "ext v2.16b, v2.16b, v2.16b, #8\n\t"
-        "ext v6.16b, v6.16b, v6.16b, #8\n\t"
-        "ext v10.16b, v10.16b, v10.16b, #8\n\t"
+        "add v10.4s, v10.4s, v15.4s\n\t"
+        "add v11.4s, v11.4s, v12.4s\n\t"
+        "add v8.4s, v8.4s, v13.4s\n\t"
+        "add v9.4s, v9.4s, v14.4s\n\t"
+        "eor v20.16b, v5.16b, v10.16b\n\t"
+        "eor v21.16b, v6.16b, v11.16b\n\t"
+        "eor v22.16b, v7.16b, v8.16b\n\t"
+        "eor v23.16b, v4.16b, v9.16b\n\t"
+        "shl v5.4s, v20.4s, #7\n\t"
+        "shl v6.4s, v21.4s, #7\n\t"
+        "shl v7.4s, v22.4s, #7\n\t"
+        "shl v4.4s, v23.4s, #7\n\t"
+        "sri v5.4s, v20.4s, #25\n\t"
+        "sri v6.4s, v21.4s, #25\n\t"
+        "sri v7.4s, v22.4s, #25\n\t"
+        "sri v4.4s, v23.4s, #25\n\t"
         "b.ne L_chacha_crypt_bytes_arm64_round_start_256_%=\n\t"
+        "mov x26, #4\n\t"
+        /* Add counter now rather than after transposing */
+        "add v12.4s, v12.4s, v28.4s\n\t"
         /* Load message */
-        "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
-        /* Add one (2 added during calculating vector results) */
-        "add w21, w21, #1\n\t"
-        /* Add back state, XOR msg, store (load next block) */
-        "add v0.4s, v0.4s, v16.4s\n\t"
-        "add v1.4s, v1.4s, v17.4s\n\t"
-        "add v2.4s, v2.4s, v18.4s\n\t"
-        "add v3.4s, v3.4s, v19.4s\n\t"
-        "eor v0.16b, v0.16b, v20.16b\n\t"
-        "eor v1.16b, v1.16b, v21.16b\n\t"
-        "eor v2.16b, v2.16b, v22.16b\n\t"
-        "eor v3.16b, v3.16b, v23.16b\n\t"
-        "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
-        "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[c]], #0x40\n\t"
-        "mov v19.s[0], %w[rol8]\n\t"
-        "add v4.4s, v4.4s, v16.4s\n\t"
-        "add v5.4s, v5.4s, v17.4s\n\t"
-        "add v6.4s, v6.4s, v18.4s\n\t"
-        "add v7.4s, v7.4s, v19.4s\n\t"
-        "eor v4.16b, v4.16b, v20.16b\n\t"
-        "eor v5.16b, v5.16b, v21.16b\n\t"
-        "eor v6.16b, v6.16b, v22.16b\n\t"
-        "eor v7.16b, v7.16b, v23.16b\n\t"
-        "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
-        "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%x[c]], #0x40\n\t"
-        "mov v19.s[0], %w[ctr]\n\t"
-        "add v8.4s, v8.4s, v16.4s\n\t"
-        "add v9.4s, v9.4s, v17.4s\n\t"
-        "add v10.4s, v10.4s, v18.4s\n\t"
-        "add v11.4s, v11.4s, v19.4s\n\t"
-        "eor v8.16b, v8.16b, v20.16b\n\t"
-        "eor v9.16b, v9.16b, v21.16b\n\t"
-        "eor v10.16b, v10.16b, v22.16b\n\t"
-        "eor v11.16b, v11.16b, v23.16b\n\t"
-        "ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
-        "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%x[c]], #0x40\n\t"
-        /* Move regular registers into vector registers for adding and xor */
-        "orr x8, x8, x9, lsl 32\n\t"
-        "orr x10, x10, x11, lsl 32\n\t"
-        "orr x12, x12, x13, lsl 32\n\t"
-        "mov v0.d[0], x8\n\t"
-        "orr x14, x14, x15, lsl 32\n\t"
-        "mov v0.d[1], x10\n\t"
-        "orr x16, x16, x17, lsl 32\n\t"
-        "mov v1.d[0], x12\n\t"
-        "orr x19, x19, x20, lsl 32\n\t"
-        "mov v1.d[1], x14\n\t"
-        "orr x21, x21, x22, lsl 32\n\t"
-        "mov v2.d[0], x16\n\t"
-        "orr x23, x23, x24, lsl 32\n\t"
-        "mov v2.d[1], x19\n\t"
-        "mov v3.d[0], x21\n\t"
-        "mov v3.d[1], x23\n\t"
-        /* Add back state, XOR in message and store */
-        "add v0.4s, v0.4s, v16.4s\n\t"
-        "add v1.4s, v1.4s, v17.4s\n\t"
-        "add v2.4s, v2.4s, v18.4s\n\t"
-        "add v3.4s, v3.4s, v19.4s\n\t"
-        "eor v0.16b, v0.16b, v20.16b\n\t"
-        "eor v1.16b, v1.16b, v21.16b\n\t"
-        "eor v2.16b, v2.16b, v22.16b\n\t"
-        "eor v3.16b, v3.16b, v23.16b\n\t"
-        "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[c]], #0x40\n\t"
-        "mov v19.d[0], x7\n\t"
+        "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
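+        /* Each vector currently holds one state word for all four blocks;
+         * transpose back to per-block order before adding the input state. */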
+        /* Transpose vectors */
+        "trn1 v20.4s, v0.4s, v1.4s\n\t"
+        "trn1 v22.4s, v2.4s, v3.4s\n\t"
+        "trn2 v21.4s, v0.4s, v1.4s\n\t"
+        "trn2 v23.4s, v2.4s, v3.4s\n\t"
+        "trn1 v0.2d, v20.2d, v22.2d\n\t"
+        "trn1 v1.2d, v21.2d, v23.2d\n\t"
+        "trn2 v2.2d, v20.2d, v22.2d\n\t"
+        "trn2 v3.2d, v21.2d, v23.2d\n\t"
+        "trn1 v20.4s, v4.4s, v5.4s\n\t"
+        "trn1 v22.4s, v6.4s, v7.4s\n\t"
+        "trn2 v21.4s, v4.4s, v5.4s\n\t"
+        "trn2 v23.4s, v6.4s, v7.4s\n\t"
+        "trn1 v4.2d, v20.2d, v22.2d\n\t"
+        "trn1 v5.2d, v21.2d, v23.2d\n\t"
+        "trn2 v6.2d, v20.2d, v22.2d\n\t"
+        "trn2 v7.2d, v21.2d, v23.2d\n\t"
+        "trn1 v20.4s, v8.4s, v9.4s\n\t"
+        "trn1 v22.4s, v10.4s, v11.4s\n\t"
+        "trn2 v21.4s, v8.4s, v9.4s\n\t"
+        "trn2 v23.4s, v10.4s, v11.4s\n\t"
+        "trn1 v8.2d, v20.2d, v22.2d\n\t"
+        "trn1 v9.2d, v21.2d, v23.2d\n\t"
+        "trn2 v10.2d, v20.2d, v22.2d\n\t"
+        "trn2 v11.2d, v21.2d, v23.2d\n\t"
+        "trn1 v20.4s, v12.4s, v13.4s\n\t"
+        "trn1 v22.4s, v14.4s, v15.4s\n\t"
+        "trn2 v21.4s, v12.4s, v13.4s\n\t"
+        "trn2 v23.4s, v14.4s, v15.4s\n\t"
+        "trn1 v12.2d, v20.2d, v22.2d\n\t"
+        "trn1 v13.2d, v21.2d, v23.2d\n\t"
+        "trn2 v14.2d, v20.2d, v22.2d\n\t"
+        "trn2 v15.2d, v21.2d, v23.2d\n\t"
+        /* Add back state, XOR in message and store (load next block) */
+        "add v20.4s, v0.4s, v16.4s\n\t"
+        "add v21.4s, v4.4s, v17.4s\n\t"
+        "add v22.4s, v8.4s, v18.4s\n\t"
+        "add v23.4s, v12.4s, v19.4s\n\t"
+        "eor v20.16b, v20.16b, v24.16b\n\t"
+        "eor v21.16b, v21.16b, v25.16b\n\t"
+        "eor v22.16b, v22.16b, v26.16b\n\t"
+        "eor v23.16b, v23.16b, v27.16b\n\t"
+        "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
+        "st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
+        "add v20.4s, v1.4s, v16.4s\n\t"
+        "add v21.4s, v5.4s, v17.4s\n\t"
+        "add v22.4s, v9.4s, v18.4s\n\t"
+        "add v23.4s, v13.4s, v19.4s\n\t"
+        "eor v20.16b, v20.16b, v24.16b\n\t"
+        "eor v21.16b, v21.16b, v25.16b\n\t"
+        "eor v22.16b, v22.16b, v26.16b\n\t"
+        "eor v23.16b, v23.16b, v27.16b\n\t"
+        "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
+        "st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
+        "add v20.4s, v2.4s, v16.4s\n\t"
+        "add v21.4s, v6.4s, v17.4s\n\t"
+        "add v22.4s, v10.4s, v18.4s\n\t"
+        "add v23.4s, v14.4s, v19.4s\n\t"
+        "eor v20.16b, v20.16b, v24.16b\n\t"
+        "eor v21.16b, v21.16b, v25.16b\n\t"
+        "eor v22.16b, v22.16b, v26.16b\n\t"
+        "eor v23.16b, v23.16b, v27.16b\n\t"
+        "ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
+        "st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
+        "add v20.4s, v3.4s, v16.4s\n\t"
+        "add v21.4s, v7.4s, v17.4s\n\t"
+        "add v22.4s, v11.4s, v18.4s\n\t"
+        "add v23.4s, v15.4s, v19.4s\n\t"
+        "eor v20.16b, v20.16b, v24.16b\n\t"
+        "eor v21.16b, v21.16b, v25.16b\n\t"
+        "eor v22.16b, v22.16b, v26.16b\n\t"
+        "eor v23.16b, v23.16b, v27.16b\n\t"
+        "st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
+        "mov v29.s[0], w26\n\t"
         "sub %w[len], %w[len], #0x100\n\t"
+        "add v19.4s, v19.4s, v29.4s\n\t"
         /* Done 256-byte block */
         "\n"
         "L_chacha_crypt_bytes_arm64_lt_256_%=: \n\t"
diff --git a/wolfcrypt/src/port/arm/armv8-curve25519.S b/wolfcrypt/src/port/arm/armv8-curve25519.S
index f7492d399..b3f0e31b4 100644
--- a/wolfcrypt/src/port/arm/armv8-curve25519.S
+++ b/wolfcrypt/src/port/arm/armv8-curve25519.S
@@ -1980,7 +1980,7 @@ _curve25519_base:
     add x2, x2, :lo12:L_curve25519_base_x2
 #else
     adrp x2, L_curve25519_base_x2@PAGE
-    add x2, x2, :lo12:L_curve25519_base_x2@PAGEOFF
+    add x2, x2, L_curve25519_base_x2@PAGEOFF
 #endif /* __APPLE__ */
     ldp x6, x7, [x2]
     ldp x8, x9, [x2, #16]