Chacha20 and poly1305 without x18 (#2454)

* Remove use of x18 (the AArch64 platform register, reserved on some ABIs) and organize new optimizations

* Fix invalid operand
This commit is contained in:
julek-wolfssl
2019-09-08 16:03:04 -07:00
committed by GitHub
parent afb15f6521
commit 4c88d94d13
2 changed files with 135 additions and 139 deletions

View File

@ -237,7 +237,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"MOV x12, v26.d[0] \n\t"
"MOV x14, v26.d[1] \n\t"
"MOV x16, v27.d[0] \n\t"
"MOV x18, v27.d[1] \n\t"
"MOV x22, v27.d[1] \n\t"
/* Move state into vector registers (x4) */
"DUP v0.4s, v24.s[0] \n\t"
"DUP v1.4s, v24.s[1] \n\t"
@ -262,7 +262,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"LSR x17, x16, #32 \n\t"
"DUP v14.4s, v27.s[2] \n\t"
"DUP v15.4s, v27.s[3] \n\t"
"LSR x19, x18, #32 \n\t"
"LSR x19, x22, #32 \n\t"
/* Add to counter word */
"ADD v12.4s, v12.4s, v28.4s \n\t"
"ADD w16, w16, w21 \n\t"
@ -286,7 +286,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"EOR v13.16b, v13.16b, v1.16b \n\t"
"EOR w17, w17, w5 \n\t"
"EOR v14.16b, v14.16b, v2.16b \n\t"
"EOR w18, w18, w6 \n\t"
"EOR w22, w22, w6 \n\t"
"EOR v15.16b, v15.16b, v3.16b \n\t"
"EOR w19, w19, w7 \n\t"
"REV32 v12.8h, v12.8h \n\t"
@ -294,7 +294,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"REV32 v13.8h, v13.8h \n\t"
"ROR w17, w17, #16 \n\t"
"REV32 v14.8h, v14.8h \n\t"
"ROR w18, w18, #16 \n\t"
"ROR w22, w22, #16 \n\t"
"REV32 v15.8h, v15.8h \n\t"
"ROR w19, w19, #16 \n\t"
/* c += d; b ^= c; b <<<= 12; */
@ -303,7 +303,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"ADD v9.4s, v9.4s, v13.4s \n\t"
"ADD w13, w13, w17 \n\t"
"ADD v10.4s, v10.4s, v14.4s \n\t"
"ADD w14, w14, w18 \n\t"
"ADD w14, w14, w22 \n\t"
"ADD v11.4s, v11.4s, v15.4s \n\t"
"ADD w15, w15, w19 \n\t"
"EOR v16.16b, v4.16b, v8.16b \n\t"
@ -340,7 +340,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"EOR v13.16b, v13.16b, v1.16b \n\t"
"EOR w17, w17, w5 \n\t"
"EOR v14.16b, v14.16b, v2.16b \n\t"
"EOR w18, w18, w6 \n\t"
"EOR w22, w22, w6 \n\t"
"EOR v15.16b, v15.16b, v3.16b \n\t"
"EOR w19, w19, w7 \n\t"
"TBL v12.16b, { v12.16b }, v30.16b \n\t"
@ -348,7 +348,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"TBL v13.16b, { v13.16b }, v30.16b \n\t"
"ROR w17, w17, #24 \n\t"
"TBL v14.16b, { v14.16b }, v30.16b \n\t"
"ROR w18, w18, #24 \n\t"
"ROR w22, w22, #24 \n\t"
"TBL v15.16b, { v15.16b }, v30.16b \n\t"
"ROR w19, w19, #24 \n\t"
/* c += d; b ^= c; b <<<= 7; */
@ -357,7 +357,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"ADD v9.4s, v9.4s, v13.4s \n\t"
"ADD w13, w13, w17 \n\t"
"ADD v10.4s, v10.4s, v14.4s \n\t"
"ADD w14, w14, w18 \n\t"
"ADD w14, w14, w22 \n\t"
"ADD v11.4s, v11.4s, v15.4s \n\t"
"ADD w15, w15, w19 \n\t"
"EOR v16.16b, v4.16b, v8.16b \n\t"
@ -397,7 +397,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"EOR v13.16b, v13.16b, v2.16b \n\t"
"EOR w17, w17, w6 \n\t"
"EOR v14.16b, v14.16b, v3.16b \n\t"
"EOR w18, w18, w7 \n\t"
"EOR w22, w22, w7 \n\t"
"REV32 v15.8h, v15.8h \n\t"
"ROR w19, w19, #16 \n\t"
"REV32 v12.8h, v12.8h \n\t"
@ -405,7 +405,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"REV32 v13.8h, v13.8h \n\t"
"ROR w17, w17, #16 \n\t"
"REV32 v14.8h, v14.8h \n\t"
"ROR w18, w18, #16 \n\t"
"ROR w22, w22, #16 \n\t"
/* c += d; b ^= c; b <<<= 12; */
"ADD v10.4s, v10.4s, v15.4s \n\t"
"ADD w14, w14, w19 \n\t"
@ -414,7 +414,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"ADD v8.4s, v8.4s, v13.4s \n\t"
"ADD w12, w12, w17 \n\t"
"ADD v9.4s, v9.4s, v14.4s \n\t"
"ADD w13, w13, w18 \n\t"
"ADD w13, w13, w22 \n\t"
"EOR v16.16b, v5.16b, v10.16b \n\t"
"EOR w9, w9, w14 \n\t"
"EOR v17.16b, v6.16b, v11.16b \n\t"
@ -451,7 +451,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"EOR v13.16b, v13.16b, v2.16b \n\t"
"EOR w17, w17, w6 \n\t"
"EOR v14.16b, v14.16b, v3.16b \n\t"
"EOR w18, w18, w7 \n\t"
"EOR w22, w22, w7 \n\t"
"TBL v15.16b, { v15.16b }, v30.16b \n\t"
"ROR w19, w19, #24 \n\t"
"TBL v12.16b, { v12.16b }, v30.16b \n\t"
@ -459,7 +459,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"TBL v13.16b, { v13.16b }, v30.16b \n\t"
"ROR w17, w17, #24 \n\t"
"TBL v14.16b, { v14.16b }, v30.16b \n\t"
"ROR w18, w18, #24 \n\t"
"ROR w22, w22, #24 \n\t"
/* c += d; b ^= c; b <<<= 7; */
"ADD v10.4s, v10.4s, v15.4s \n\t"
"ADD w14, w14, w19 \n\t"
@ -468,7 +468,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"ADD v8.4s, v8.4s, v13.4s \n\t"
"ADD w12, w12, w17 \n\t"
"ADD v9.4s, v9.4s, v14.4s \n\t"
"ADD w13, w13, w18 \n\t"
"ADD w13, w13, w22 \n\t"
"EOR v16.16b, v5.16b, v10.16b \n\t"
"EOR w9, w9, w14 \n\t"
"EOR v17.16b, v6.16b, v11.16b \n\t"
@ -582,10 +582,10 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
"MOV v17.d[1], x10 \n\t"
"ORR x16, x16, x17, LSL #32 \n\t"
"MOV v18.d[0], x12 \n\t"
"ORR x18, x18, x19, LSL #32 \n\t"
"ORR x22, x22, x19, LSL #32 \n\t"
"MOV v18.d[1], x14 \n\t"
"MOV v19.d[0], x16 \n\t"
"MOV v19.d[1], x18 \n\t"
"MOV v19.d[1], x22 \n\t"
/* Add back state, XOR in message and store */
"ADD v16.4s, v16.4s, v24.4s \n\t"
"ADD v17.4s, v17.4s, v25.4s \n\t"
@ -606,7 +606,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
[L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8)
: "memory", "cc",
"x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12",
"x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21",
"x13", "x14", "x15", "x16", "x17", "x22", "x19", "x20", "x21",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"
@ -650,11 +650,11 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"MOV x12, v22.D[0] \n\t"
"MOV x14, v22.D[1] \n\t"
"MOV x16, v23.D[0] \n\t"
"MOV x18, v23.D[1] \n\t"
"MOV x22, v23.D[1] \n\t"
/* Move state into vector registers (x3) */
"MOV v0.16B, v20.16B \n\t"
"MOV v1.16B, v21.16B \n\t"
"LSR x19, x18, #32 \n\t"
"LSR x19, x22, #32 \n\t"
"MOV v2.16B, v22.16B \n\t"
"ADD w20, w16, #1 \n\t"
"MOV v3.16B, v23.16B \n\t"
@ -697,7 +697,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"EOR v7.16B, v7.16B, v4.16B \n\t"
"EOR w17, w17, w5 \n\t"
"EOR v11.16B, v11.16B, v8.16B \n\t"
"EOR w18, w18, w6 \n\t"
"EOR w22, w22, w6 \n\t"
"REV32 v3.8H, v3.8H \n\t"
"EOR w19, w19, w7 \n\t"
"REV32 v7.8H, v7.8H \n\t"
@ -706,7 +706,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
/* c += d; b ^= c; b <<<= 12; */
"ROR w17, w17, #16 \n\t"
"ADD v2.4S, v2.4S, v3.4S \n\t"
"ROR w18, w18, #16 \n\t"
"ROR w22, w22, #16 \n\t"
"ADD v6.4S, v6.4S, v7.4S \n\t"
"ROR w19, w19, #16 \n\t"
"ADD v10.4S, v10.4S, v11.4S \n\t"
@ -714,7 +714,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"EOR v12.16B, v1.16B, v2.16B \n\t"
"ADD w13, w13, w17 \n\t"
"EOR v13.16B, v5.16B, v6.16B \n\t"
"ADD w14, w14, w18 \n\t"
"ADD w14, w14, w22 \n\t"
"EOR v14.16B, v9.16B, v10.16B \n\t"
"ADD w15, w15, w19 \n\t"
"SHL v1.4S, v12.4S, #12 \n\t"
@ -748,7 +748,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"EOR w17, w17, w5 \n\t"
"TBL v11.16B, { v11.16B }, v24.16B \n\t"
/* c += d; b ^= c; b <<<= 7; */
"EOR w18, w18, w6 \n\t"
"EOR w22, w22, w6 \n\t"
"ADD v2.4S, v2.4S, v3.4S \n\t"
"EOR w19, w19, w7 \n\t"
"ADD v6.4S, v6.4S, v7.4S \n\t"
@ -756,7 +756,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"ADD v10.4S, v10.4S, v11.4S \n\t"
"ROR w17, w17, #24 \n\t"
"EOR v12.16B, v1.16B, v2.16B \n\t"
"ROR w18, w18, #24 \n\t"
"ROR w22, w22, #24 \n\t"
"EOR v13.16B, v5.16B, v6.16B \n\t"
"ROR w19, w19, #24 \n\t"
"EOR v14.16B, v9.16B, v10.16B \n\t"
@ -764,7 +764,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"SHL v1.4S, v12.4S, #7 \n\t"
"ADD w13, w13, w17 \n\t"
"SHL v5.4S, v13.4S, #7 \n\t"
"ADD w14, w14, w18 \n\t"
"ADD w14, w14, w22 \n\t"
"SHL v9.4S, v14.4S, #7 \n\t"
"ADD w15, w15, w19 \n\t"
"SRI v1.4S, v12.4S, #25 \n\t"
@ -803,7 +803,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"EOR v11.16B, v11.16B, v8.16B \n\t"
"EOR w17, w17, w6 \n\t"
"REV32 v3.8H, v3.8H \n\t"
"EOR w18, w18, w7 \n\t"
"EOR w22, w22, w7 \n\t"
"REV32 v7.8H, v7.8H \n\t"
"ROR w19, w19, #16 \n\t"
"REV32 v11.8H, v11.8H \n\t"
@ -812,7 +812,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"ADD v2.4S, v2.4S, v3.4S \n\t"
"ROR w17, w17, #16 \n\t"
"ADD v6.4S, v6.4S, v7.4S \n\t"
"ROR w18, w18, #16 \n\t"
"ROR w22, w22, #16 \n\t"
"ADD v10.4S, v10.4S, v11.4S \n\t"
"ADD w14, w14, w19 \n\t"
"EOR v12.16B, v1.16B, v2.16B \n\t"
@ -820,7 +820,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"EOR v13.16B, v5.16B, v6.16B \n\t"
"ADD w12, w12, w17 \n\t"
"EOR v14.16B, v9.16B, v10.16B \n\t"
"ADD w13, w13, w18 \n\t"
"ADD w13, w13, w22 \n\t"
"SHL v1.4S, v12.4S, #12 \n\t"
"EOR w9, w9, w14 \n\t"
"SHL v5.4S, v13.4S, #12 \n\t"
@ -854,7 +854,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
/* c += d; b ^= c; b <<<= 7; */
"EOR w17, w17, w6 \n\t"
"ADD v2.4S, v2.4S, v3.4S \n\t"
"EOR w18, w18, w7 \n\t"
"EOR w22, w22, w7 \n\t"
"ADD v6.4S, v6.4S, v7.4S \n\t"
"ROR w19, w19, #24 \n\t"
"ADD v10.4S, v10.4S, v11.4S \n\t"
@ -862,7 +862,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"EOR v12.16B, v1.16B, v2.16B \n\t"
"ROR w17, w17, #24 \n\t"
"EOR v13.16B, v5.16B, v6.16B \n\t"
"ROR w18, w18, #24 \n\t"
"ROR w22, w22, #24 \n\t"
"EOR v14.16B, v9.16B, v10.16B \n\t"
"ADD w14, w14, w19 \n\t"
"SHL v1.4S, v12.4S, #7 \n\t"
@ -870,7 +870,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"SHL v5.4S, v13.4S, #7 \n\t"
"ADD w12, w12, w17 \n\t"
"SHL v9.4S, v14.4S, #7 \n\t"
"ADD w13, w13, w18 \n\t"
"ADD w13, w13, w22 \n\t"
"SRI v1.4S, v12.4S, #25 \n\t"
"EOR w9, w9, w14 \n\t"
"SRI v5.4S, v13.4S, #25 \n\t"
@ -942,10 +942,10 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
"MOV v13.D[1], x10 \n\t"
"ORR x16, x16, x17, lsl #32 \n\t"
"MOV v14.D[0], x12 \n\t"
"ORR x18, x18, x19, lsl #32 \n\t"
"ORR x22, x22, x19, lsl #32 \n\t"
"MOV v14.D[1], x14 \n\t"
"MOV v15.D[0], x16 \n\t"
"MOV v15.D[1], x18 \n\t"
"MOV v15.D[1], x22 \n\t"
/* Add back state, XOR in message and store */
"ADD v12.4S, v12.4S, v20.4S \n\t"
"ADD v13.4S, v13.4S, v21.4S \n\t"
@ -960,7 +960,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
: [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8)
: "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
"x10", "x11", "x12", "x13", "x14", "x15", "x16",
"x17", "x18", "x19", "x20", "x21", "v0", "v1",
"x17", "x22", "x19", "x20", "x21", "v0", "v1",
"v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20",

View File

@ -88,18 +88,18 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
/* t1 = U8TO64(&m[8]); */
"LDP x16, x17, [%[m]], #16 \n\t"
/* h0 += (U8TO32(m + 0)) & 0x3ffffff; */
"AND x18, x16, #0x3ffffff \n\t"
"ADD x2, x2, x18 \n\t"
"AND x26, x16, #0x3ffffff \n\t"
"ADD x2, x2, x26 \n\t"
/* h1 += (U8TO32(m + 3) >> 2) & 0x3ffffff; */
"AND x18, x14, x16, LSR #26 \n\t"
"ADD x3, x3, x18 \n\t"
"AND x26, x14, x16, LSR #26 \n\t"
"ADD x3, x3, x26 \n\t"
/* h2 += (U8TO32(m + 6) >> 4) & 0x3ffffff; */
"EXTR x18, x17, x16, #52 \n\t"
"AND x18, x18, #0x3ffffff \n\t"
"ADD x4, x4, x18 \n\t"
"EXTR x26, x17, x16, #52 \n\t"
"AND x26, x26, #0x3ffffff \n\t"
"ADD x4, x4, x26 \n\t"
/* h3 += (U8TO32(m + 9) >> 6) & 0x3ffffff; */
"AND x18, x14, x17, LSR #14 \n\t"
"ADD x5, x5, x18 \n\t"
"AND x26, x14, x17, LSR #14 \n\t"
"ADD x5, x5, x26 \n\t"
/* h4 += (U8TO32(m + 12) >> 8) | hibit; */
"ORR x17, %[finished], x17, LSR #40 \n\t"
"ADD x6, x6, x17 \n\t"
@ -110,27 +110,27 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
/* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
"MUL x16, x2, x21 \n\t"
"MUL x17, x2, x22 \n\t"
"MUL x18, x2, x23 \n\t"
"MUL x26, x2, x23 \n\t"
"MUL x19, x2, x24 \n\t"
"MUL x20, x2, x25 \n\t"
"MADD x16, x3, x10, x16 \n\t"
"MADD x17, x3, x21, x17 \n\t"
"MADD x18, x3, x22, x18 \n\t"
"MADD x26, x3, x22, x26 \n\t"
"MADD x19, x3, x23, x19 \n\t"
"MADD x20, x3, x24, x20 \n\t"
"MADD x16, x4, x9, x16 \n\t"
"MADD x17, x4, x10, x17 \n\t"
"MADD x18, x4, x21, x18 \n\t"
"MADD x26, x4, x21, x26 \n\t"
"MADD x19, x4, x22, x19 \n\t"
"MADD x20, x4, x23, x20 \n\t"
"MADD x16, x5, x8, x16 \n\t"
"MADD x17, x5, x9, x17 \n\t"
"MADD x18, x5, x10, x18 \n\t"
"MADD x26, x5, x10, x26 \n\t"
"MADD x19, x5, x21, x19 \n\t"
"MADD x20, x5, x22, x20 \n\t"
"MADD x16, x6, x7, x16 \n\t"
"MADD x17, x6, x8, x17 \n\t"
"MADD x18, x6, x9, x18 \n\t"
"MADD x26, x6, x9, x26 \n\t"
"MADD x19, x6, x10, x19 \n\t"
"MADD x20, x6, x21, x20 \n\t"
/* d1 = d1 + d0 >> 26 */
@ -151,11 +151,11 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
"LSR x2, x20, #26 \n\t"
"AND x19, x19, #0x3ffffff \n\t"
"MADD x16, x2, x15, x16 \n\t"
"ADD x18, x18, x17, LSR #26 \n\t"
"ADD x26, x26, x17, LSR #26 \n\t"
"AND x17, x17, #0x3ffffff \n\t"
"AND x20, x20, #0x3ffffff \n\t"
"ADD x19, x19, x18, LSR #26 \n\t"
"AND x4, x18, #0x3ffffff \n\t"
"ADD x19, x19, x26, LSR #26 \n\t"
"AND x4, x26, #0x3ffffff \n\t"
"ADD x3, x17, x16, LSR #26 \n\t"
"AND x2, x16, #0x3ffffff \n\t"
"ADD x6, x20, x19, LSR #26 \n\t"
@ -182,8 +182,8 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
: "memory", "cc",
"w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w15",
"w21", "w22", "w23", "w24", "w25", "x2", "x3", "x4", "x5", "x6",
"x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17", "x18", "x19",
"x20", "x21", "x22", "x23", "x24", "x25"
"x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17", "x19", "x20",
"x21", "x22", "x23", "x24", "x25", "x26"
);
}
@ -200,15 +200,16 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"MOV v27.D[0], x9 \n\t"
"LDR w24, [%[h], #16] \n\t"
"MOV v27.D[1], x9 \n\t"
"MOV x9, #5 \n\t"
"LSR x21, x20, #32 \n\t"
"MOV v28.D[0], x9 \n\t"
"DUP v29.4S, v27.S[0] \n\t"
"LSR x23, x22, #32 \n\t"
"MOV x9, #5 \n\t"
"AND x20, x20, #0x3ffffff \n\t"
"MOV v28.D[0], x9 \n\t"
"AND x22, x22, #0x3ffffff \n\t"
/* Zero accumulator registers */
"MOVI v15.2D, #0x0 \n\t"
"AND x20, x20, #0x3ffffff \n\t"
"MOVI v16.2D, #0x0 \n\t"
"AND x22, x22, #0x3ffffff \n\t"
"MOVI v17.2D, #0x0 \n\t"
"MOVI v18.2D, #0x0 \n\t"
"MOVI v19.2D, #0x0 \n\t"
@ -218,6 +219,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"LSL x9, x9, #24 \n\t"
"MOV v26.D[0], x9 \n\t"
"MOV v26.D[1], x9 \n\t"
"DUP v30.4S, v26.S[0] \n\t"
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t"
"BLO L_poly1305_64_start_block_size_64_%= \n\t"
/* Load r^2 to NEON v0, v1, v2, v3, v4 */
@ -254,14 +256,12 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"MOV w15, v5.S[0] \n\t"
"MOV w16, v6.S[0] \n\t"
"MOV w17, v7.S[0] \n\t"
"MOV w18, v8.S[0] \n\t"
"MOV w8, v8.S[0] \n\t"
"MOV w19, v9.S[0] \n\t"
/* Load m */
/* Load four message blocks to NEON v10, v11, v12, v13, v14 */
"LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
"DUP v29.4S, v27.S[0] \n\t"
"DUP v30.4S, v26.S[0] \n\t"
"USHR v14.4S, v13.4S, #8 \n\t"
"ORR v14.16B, v14.16B, v30.16B \n\t"
"SHL v13.4S, v13.4S, #18 \n\t"
@ -275,10 +275,6 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"AND v12.16B, v12.16B, v29.16B \n\t"
"AND v13.16B, v13.16B, v29.16B \n\t"
"AND v14.16B, v14.16B, v29.16B \n\t"
"MOV v27.S[1], wzr \n\t"
"MOV v27.S[3], wzr \n\t"
"MOV v26.S[1], wzr \n\t"
"MOV v26.S[3], wzr \n\t"
/* Four message blocks loaded */
/* Add messages to accumulator */
"ADD v15.2S, v15.2S, v10.2S \n\t"
@ -321,7 +317,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"UMLAL v25.2D, v16.2S, v3.2S \n\t"
"MADD x13, x21, x28, x13 \n\t"
"UMLAL v21.2D, v17.2S, v8.2S \n\t"
"MADD x9, x22, x18, x9 \n\t"
"MADD x9, x22, x8, x9 \n\t"
"UMLAL v22.2D, v17.2S, v9.2S \n\t"
"MADD x10, x22, x19, x10 \n\t"
"UMLAL v23.2D, v17.2S, v0.2S \n\t"
@ -333,7 +329,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"UMLAL v21.2D, v18.2S, v7.2S \n\t"
"MADD x9, x23, x17, x9 \n\t"
"UMLAL v22.2D, v18.2S, v8.2S \n\t"
"MADD x10, x23, x18, x10 \n\t"
"MADD x10, x23, x8, x10 \n\t"
"UMLAL v23.2D, v18.2S, v9.2S \n\t"
"MADD x11, x23, x19, x11 \n\t"
"UMLAL v24.2D, v18.2S, v0.2S \n\t"
@ -345,7 +341,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"UMLAL v22.2D, v19.2S, v7.2S \n\t"
"MADD x10, x24, x17, x10 \n\t"
"UMLAL v23.2D, v19.2S, v8.2S \n\t"
"MADD x11, x24, x18, x11 \n\t"
"MADD x11, x24, x8, x11 \n\t"
"UMLAL v24.2D, v19.2S, v9.2S \n\t"
"MADD x12, x24, x19, x12 \n\t"
"UMLAL v25.2D, v19.2S, v0.2S \n\t"
@ -460,52 +456,52 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
/* Copy r^2 to lower half of registers */
"MOV v0.D[0], v0.D[1] \n\t"
"MOV v5.D[0], v5.D[1] \n\t"
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
"MOV v1.D[0], v1.D[1] \n\t"
"MOV v5.D[0], v5.D[1] \n\t"
"USHR v14.2D, v11.2D, #40 \n\t"
"MOV v6.D[0], v6.D[1] \n\t"
"MOV v1.D[0], v1.D[1] \n\t"
"ORR v14.16B, v14.16B, v26.16B \n\t"
"MOV v2.D[0], v2.D[1] \n\t"
"MOV v6.D[0], v6.D[1] \n\t"
"USHR v13.2D, v11.2D, #14 \n\t"
"MOV v7.D[0], v7.D[1] \n\t"
"MOV v2.D[0], v2.D[1] \n\t"
"AND v13.16B, v13.16B, v27.16B \n\t"
"MOV v3.D[0], v3.D[1] \n\t"
"MOV v7.D[0], v7.D[1] \n\t"
"SHL v12.2D, v11.2D, #12 \n\t"
"MOV v8.D[0], v8.D[1] \n\t"
"MOV v3.D[0], v3.D[1] \n\t"
"SRI v12.2D, v10.2D, #52 \n\t"
"MOV v4.D[0], v4.D[1] \n\t"
"MOV v8.D[0], v8.D[1] \n\t"
"AND v12.16B, v12.16B, v27.16B \n\t"
"MOV v9.D[0], v9.D[1] \n\t"
"MOV v4.D[0], v4.D[1] \n\t"
"USHR v11.2D, v10.2D, #26 \n\t"
"MOV v9.D[0], v9.D[1] \n\t"
"AND v11.16B, v11.16B, v27.16B \n\t"
/* Copy r^2 to ARM */
"MOV w25, v0.S[2] \n\t"
"AND v11.16B, v11.16B, v27.16B \n\t"
"MOV w26, v1.S[2] \n\t"
"AND v10.16B, v10.16B, v27.16B \n\t"
"MOV w27, v2.S[2] \n\t"
"MOV w26, v1.S[2] \n\t"
/* Two message blocks loaded */
/* Add last messages */
"ADD v21.2D, v21.2D, v10.2D \n\t"
"MOV w28, v3.S[2] \n\t"
"MOV w27, v2.S[2] \n\t"
"ADD v22.2D, v22.2D, v11.2D \n\t"
"MOV w30, v4.S[2] \n\t"
"MOV w28, v3.S[2] \n\t"
"ADD v23.2D, v23.2D, v12.2D \n\t"
"MOV w30, v4.S[2] \n\t"
"ADD v24.2D, v24.2D, v13.2D \n\t"
/* Copy 5*r^2 to ARM */
"MOV w15, v5.S[2] \n\t"
"ADD v24.2D, v24.2D, v13.2D \n\t"
"MOV w16, v6.S[2] \n\t"
"ADD v25.2D, v25.2D, v14.2D \n\t"
"MOV w17, v7.S[2] \n\t"
"MOV w16, v6.S[2] \n\t"
/* Reduce message to be ready for next multiplication */
/* Reduce radix 26 NEON */
/* Interleave h0 -> h1 -> h2 -> h3 -> h4 */
/* with h3 -> h4 -> h0 -> h1 */
"USRA v22.2D, v21.2D, #26 \n\t"
"MOV w18, v8.S[2] \n\t"
"MOV w17, v7.S[2] \n\t"
"AND v21.16B, v21.16B, v27.16B \n\t"
"MOV w19, v9.S[2] \n\t"
"MOV w8, v8.S[2] \n\t"
"USRA v25.2D, v24.2D, #26 \n\t"
"MOV w19, v9.S[2] \n\t"
"AND v24.16B, v24.16B, v27.16B \n\t"
"USHR v15.2D, v25.2D, #26 \n\t"
"USRA v23.2D, v22.2D, #26 \n\t"
@ -555,7 +551,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"MOV w15, v5.S[0] \n\t"
"MOV w16, v6.S[0] \n\t"
"MOV w17, v7.S[0] \n\t"
"MOV w18, v8.S[0] \n\t"
"MOV w8, v8.S[0] \n\t"
"MOV w19, v9.S[0] \n\t"
/* Load m */
/* Load two message blocks to NEON v10, v11, v12, v13, v14 */
@ -571,11 +567,11 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"USHR v11.2D, v10.2D, #26 \n\t"
"AND v11.16B, v11.16B, v27.16B \n\t"
"AND v10.16B, v10.16B, v27.16B \n\t"
"MOV v10.S[1], v10.S[2] \n\t"
"MOV v11.S[1], v11.S[2] \n\t"
"MOV v12.S[1], v12.S[2] \n\t"
"MOV v13.S[1], v13.S[2] \n\t"
"MOV v14.S[1], v14.S[2] \n\t"
"MOV v10.S[1], v10.S[2] \n\t"
"MOV v11.S[1], v11.S[2] \n\t"
"MOV v12.S[1], v12.S[2] \n\t"
"MOV v13.S[1], v13.S[2] \n\t"
"MOV v14.S[1], v14.S[2] \n\t"
/* Two message blocks loaded */
/* Add messages to accumulator */
"ADD v15.2S, v15.2S, v10.2S \n\t"
@ -618,7 +614,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"UMLAL v25.2D, v16.2S, v3.2S \n\t"
"MADD x13, x21, x28, x13 \n\t"
"UMLAL v21.2D, v17.2S, v8.2S \n\t"
"MADD x9, x22, x18, x9 \n\t"
"MADD x9, x22, x8, x9 \n\t"
"UMLAL v22.2D, v17.2S, v9.2S \n\t"
"MADD x10, x22, x19, x10 \n\t"
"UMLAL v23.2D, v17.2S, v0.2S \n\t"
@ -630,7 +626,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"UMLAL v21.2D, v18.2S, v7.2S \n\t"
"MADD x9, x23, x17, x9 \n\t"
"UMLAL v22.2D, v18.2S, v8.2S \n\t"
"MADD x10, x23, x18, x10 \n\t"
"MADD x10, x23, x8, x10 \n\t"
"UMLAL v23.2D, v18.2S, v9.2S \n\t"
"MADD x11, x23, x19, x11 \n\t"
"UMLAL v24.2D, v18.2S, v0.2S \n\t"
@ -642,7 +638,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"UMLAL v22.2D, v19.2S, v7.2S \n\t"
"MADD x10, x24, x17, x10 \n\t"
"UMLAL v23.2D, v19.2S, v8.2S \n\t"
"MADD x11, x24, x18, x11 \n\t"
"MADD x11, x24, x8, x11 \n\t"
"UMLAL v24.2D, v19.2S, v9.2S \n\t"
"MADD x12, x24, x19, x12 \n\t"
"UMLAL v25.2D, v19.2S, v0.2S \n\t"
@ -652,37 +648,37 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
/* Reduce h % P */
"MOV x14, #5 \n\t"
"ADD x10, x10, x9, LSR #26 \n\t"
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
"ADD x13, x13, x12, LSR #26 \n\t"
"ADD x10, x10, x9, LSR #26 \n\t"
"USHR v14.2D, v11.2D, #40 \n\t"
"AND x9, x9, #0x3ffffff \n\t"
"ADD x13, x13, x12, LSR #26 \n\t"
"ORR v14.16B, v14.16B, v26.16B \n\t"
"LSR x20, x13, #26 \n\t"
"AND x9, x9, #0x3ffffff \n\t"
"USHR v13.2D, v11.2D, #14 \n\t"
"AND x12, x12, #0x3ffffff \n\t"
"LSR x20, x13, #26 \n\t"
"AND v13.16B, v13.16B, v27.16B \n\t"
"MADD x9, x20, x14, x9 \n\t"
"AND x12, x12, #0x3ffffff \n\t"
"SHL v12.2D, v11.2D, #12 \n\t"
"ADD x11, x11, x10, LSR #26 \n\t"
"MADD x9, x20, x14, x9 \n\t"
"SRI v12.2D, v10.2D, #52 \n\t"
"AND x10, x10, #0x3ffffff \n\t"
"ADD x11, x11, x10, LSR #26 \n\t"
"AND v12.16B, v12.16B, v27.16B \n\t"
"AND x13, x13, #0x3ffffff \n\t"
"AND x10, x10, #0x3ffffff \n\t"
"USHR v11.2D, v10.2D, #26 \n\t"
"ADD x12, x12, x11, LSR #26 \n\t"
"AND x13, x13, #0x3ffffff \n\t"
"AND v11.16B, v11.16B, v27.16B \n\t"
"AND x22, x11, #0x3ffffff \n\t"
"ADD x12, x12, x11, LSR #26 \n\t"
"AND v10.16B, v10.16B, v27.16B \n\t"
"ADD x21, x10, x9, LSR #26 \n\t"
"AND x22, x11, #0x3ffffff \n\t"
/* Two message blocks loaded */
"ADD v21.2D, v21.2D, v10.2D \n\t"
"AND x20, x9, #0x3ffffff \n\t"
"ADD x21, x10, x9, LSR #26 \n\t"
"ADD v22.2D, v22.2D, v11.2D \n\t"
"ADD x24, x13, x12, LSR #26 \n\t"
"AND x20, x9, #0x3ffffff \n\t"
"ADD v23.2D, v23.2D, v12.2D \n\t"
"AND x23, x12, #0x3ffffff \n\t"
"ADD x24, x13, x12, LSR #26 \n\t"
"ADD v24.2D, v24.2D, v13.2D \n\t"
"AND x23, x12, #0x3ffffff \n\t"
"ADD v25.2D, v25.2D, v14.2D \n\t"
/* Reduce radix 26 NEON */
/* Interleave h0 -> h1 -> h2 -> h3 -> h4 */
@ -752,7 +748,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"UMULL v22.2D, v15.2S, v1.2S \n\t"
"MADD x13, x21, x28, x13 \n\t"
"UMULL v23.2D, v15.2S, v2.2S \n\t"
"MADD x9, x22, x18, x9 \n\t"
"MADD x9, x22, x8, x9 \n\t"
"UMULL v24.2D, v15.2S, v3.2S \n\t"
"MADD x10, x22, x19, x10 \n\t"
"UMULL v25.2D, v15.2S, v4.2S \n\t"
@ -764,7 +760,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"UMLAL v23.2D, v16.2S, v1.2S \n\t"
"MADD x9, x23, x17, x9 \n\t"
"UMLAL v24.2D, v16.2S, v2.2S \n\t"
"MADD x10, x23, x18, x10 \n\t"
"MADD x10, x23, x8, x10 \n\t"
"UMLAL v25.2D, v16.2S, v3.2S \n\t"
"MADD x11, x23, x19, x11 \n\t"
"UMLAL v21.2D, v17.2S, v8.2S \n\t"
@ -776,7 +772,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"UMLAL v24.2D, v17.2S, v1.2S \n\t"
"MADD x10, x24, x17, x10 \n\t"
"UMLAL v25.2D, v17.2S, v2.2S \n\t"
"MADD x11, x24, x18, x11 \n\t"
"MADD x11, x24, x8, x11 \n\t"
"UMLAL v21.2D, v18.2S, v7.2S \n\t"
"MADD x12, x24, x19, x12 \n\t"
"UMLAL v22.2D, v18.2S, v8.2S \n\t"
@ -866,12 +862,12 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
: "memory", "cc",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "w9", "w10",
"w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20",
"w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28", "w30", "x9",
"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x30",
"v29", "v30"
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17",
"w19", "w20", "w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28",
"w30", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
"x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27",
"x28", "x30"
);
poly1305_blocks_16(ctx, m, bytes);
}
@ -950,42 +946,42 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
"MUL x15, x19, x20 \n\t"
"MUL x16, x19, x21 \n\t"
"MUL x17, x19, x22 \n\t"
"MUL x18, x19, x23 \n\t"
"MUL x7, x19, x23 \n\t"
"MADD x14, x20, x27, x14 \n\t"
"MADD x15, x20, x19, x15 \n\t"
"MADD x16, x20, x20, x16 \n\t"
"MADD x17, x20, x21, x17 \n\t"
"MADD x18, x20, x22, x18 \n\t"
"MADD x7, x20, x22, x7 \n\t"
"MADD x14, x21, x26, x14 \n\t"
"MADD x15, x21, x27, x15 \n\t"
"MADD x16, x21, x19, x16 \n\t"
"MADD x17, x21, x20, x17 \n\t"
"MADD x18, x21, x21, x18 \n\t"
"MADD x7, x21, x21, x7 \n\t"
"MADD x14, x22, x25, x14 \n\t"
"MADD x15, x22, x26, x15 \n\t"
"MADD x16, x22, x27, x16 \n\t"
"MADD x17, x22, x19, x17 \n\t"
"MADD x18, x22, x20, x18 \n\t"
"MADD x7, x22, x20, x7 \n\t"
"MADD x14, x23, x24, x14 \n\t"
"MADD x15, x23, x25, x15 \n\t"
"MADD x16, x23, x26, x16 \n\t"
"MADD x17, x23, x27, x17 \n\t"
"MADD x18, x23, x19, x18 \n\t"
"MADD x7, x23, x19, x7 \n\t"
/* r_2 = r^2 % P */
"ADD x15, x15, x14, LSR #26 \n\t"
"ADD x18, x18, x17, LSR #26 \n\t"
"ADD x7, x7, x17, LSR #26 \n\t"
"AND x14, x14, #0x3ffffff \n\t"
"LSR x9, x18, #26 \n\t"
"LSR x9, x7, #26 \n\t"
"AND x17, x17, #0x3ffffff \n\t"
"MADD x14, x9, x8, x14 \n\t"
"ADD x16, x16, x15, LSR #26 \n\t"
"AND x15, x15, #0x3ffffff \n\t"
"AND x18, x18, #0x3ffffff \n\t"
"AND x7, x7, #0x3ffffff \n\t"
"ADD x17, x17, x16, LSR #26 \n\t"
"AND x16, x16, #0x3ffffff \n\t"
"ADD x15, x15, x14, LSR #26 \n\t"
"AND x14, x14, #0x3ffffff \n\t"
"ADD x18, x18, x17, LSR #26 \n\t"
"ADD x7, x7, x17, LSR #26 \n\t"
"AND x17, x17, #0x3ffffff \n\t"
/* Store r */
"ORR x19, x19, x20, LSL #32 \n\t"
@ -996,7 +992,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
"MUL x24, x15, x8 \n\t"
"MUL x25, x16, x8 \n\t"
"MUL x26, x17, x8 \n\t"
"MUL x27, x18, x8 \n\t"
"MUL x27, x7, x8 \n\t"
/* Compute r^4 */
/* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
/* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
@ -1007,7 +1003,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
"MUL x20, x14, x15 \n\t"
"MUL x21, x14, x16 \n\t"
"MUL x22, x14, x17 \n\t"
"MUL x23, x14, x18 \n\t"
"MUL x23, x14, x7 \n\t"
"MADD x19, x15, x27, x19 \n\t"
"MADD x20, x15, x14, x20 \n\t"
"MADD x21, x15, x15, x21 \n\t"
@ -1023,11 +1019,11 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
"MADD x21, x17, x27, x21 \n\t"
"MADD x22, x17, x14, x22 \n\t"
"MADD x23, x17, x15, x23 \n\t"
"MADD x19, x18, x24, x19 \n\t"
"MADD x20, x18, x25, x20 \n\t"
"MADD x21, x18, x26, x21 \n\t"
"MADD x22, x18, x27, x22 \n\t"
"MADD x23, x18, x14, x23 \n\t"
"MADD x19, x7, x24, x19 \n\t"
"MADD x20, x7, x25, x20 \n\t"
"MADD x21, x7, x26, x21 \n\t"
"MADD x22, x7, x27, x22 \n\t"
"MADD x23, x7, x14, x23 \n\t"
/* r^4 % P */
"ADD x20, x20, x19, LSR #26 \n\t"
"ADD x23, x23, x22, LSR #26 \n\t"
@ -1048,7 +1044,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
"ORR x14, x14, x15, LSL #32 \n\t"
"ORR x16, x16, x17, LSL #32 \n\t"
"STP x14, x16, [%[ctx_r_2]] \n\t"
"STR w18, [%[ctx_r_2], #16] \n\t"
"STR w7, [%[ctx_r_2], #16] \n\t"
/* Store r^4 */
"ORR x19, x19, x20, LSL #32 \n\t"
"ORR x21, x21, x22, LSL #32 \n\t"
@ -1074,9 +1070,9 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
[ctx_leftover] "r" (&ctx->leftover),
[ctx_finished] "r" (&ctx->finished)
: "memory", "cc",
"w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23",
"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",
"x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
"w7", "w14", "w15", "w16", "w17", "w19", "w20", "w21", "w22", "w23",
"x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
"x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
return 0;