mirror of
https://github.com/wolfSSL/wolfssl.git
synced 2025-07-29 18:27:29 +02:00
Chacha20 and poly1305 without x18 (#2454)
* Remove use of x18 and organize new optimizations
* Fix invalid operand
This commit is contained in:
@ -237,7 +237,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"MOV x12, v26.d[0] \n\t"
|
||||
"MOV x14, v26.d[1] \n\t"
|
||||
"MOV x16, v27.d[0] \n\t"
|
||||
"MOV x18, v27.d[1] \n\t"
|
||||
"MOV x22, v27.d[1] \n\t"
|
||||
/* Move state into vector registers (x4) */
|
||||
"DUP v0.4s, v24.s[0] \n\t"
|
||||
"DUP v1.4s, v24.s[1] \n\t"
|
||||
@ -262,7 +262,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"LSR x17, x16, #32 \n\t"
|
||||
"DUP v14.4s, v27.s[2] \n\t"
|
||||
"DUP v15.4s, v27.s[3] \n\t"
|
||||
"LSR x19, x18, #32 \n\t"
|
||||
"LSR x19, x22, #32 \n\t"
|
||||
/* Add to counter word */
|
||||
"ADD v12.4s, v12.4s, v28.4s \n\t"
|
||||
"ADD w16, w16, w21 \n\t"
|
||||
@ -286,7 +286,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"EOR v13.16b, v13.16b, v1.16b \n\t"
|
||||
"EOR w17, w17, w5 \n\t"
|
||||
"EOR v14.16b, v14.16b, v2.16b \n\t"
|
||||
"EOR w18, w18, w6 \n\t"
|
||||
"EOR w22, w22, w6 \n\t"
|
||||
"EOR v15.16b, v15.16b, v3.16b \n\t"
|
||||
"EOR w19, w19, w7 \n\t"
|
||||
"REV32 v12.8h, v12.8h \n\t"
|
||||
@ -294,7 +294,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"REV32 v13.8h, v13.8h \n\t"
|
||||
"ROR w17, w17, #16 \n\t"
|
||||
"REV32 v14.8h, v14.8h \n\t"
|
||||
"ROR w18, w18, #16 \n\t"
|
||||
"ROR w22, w22, #16 \n\t"
|
||||
"REV32 v15.8h, v15.8h \n\t"
|
||||
"ROR w19, w19, #16 \n\t"
|
||||
/* c += d; b ^= c; b <<<= 12; */
|
||||
@ -303,7 +303,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"ADD v9.4s, v9.4s, v13.4s \n\t"
|
||||
"ADD w13, w13, w17 \n\t"
|
||||
"ADD v10.4s, v10.4s, v14.4s \n\t"
|
||||
"ADD w14, w14, w18 \n\t"
|
||||
"ADD w14, w14, w22 \n\t"
|
||||
"ADD v11.4s, v11.4s, v15.4s \n\t"
|
||||
"ADD w15, w15, w19 \n\t"
|
||||
"EOR v16.16b, v4.16b, v8.16b \n\t"
|
||||
@ -340,7 +340,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"EOR v13.16b, v13.16b, v1.16b \n\t"
|
||||
"EOR w17, w17, w5 \n\t"
|
||||
"EOR v14.16b, v14.16b, v2.16b \n\t"
|
||||
"EOR w18, w18, w6 \n\t"
|
||||
"EOR w22, w22, w6 \n\t"
|
||||
"EOR v15.16b, v15.16b, v3.16b \n\t"
|
||||
"EOR w19, w19, w7 \n\t"
|
||||
"TBL v12.16b, { v12.16b }, v30.16b \n\t"
|
||||
@ -348,7 +348,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"TBL v13.16b, { v13.16b }, v30.16b \n\t"
|
||||
"ROR w17, w17, #24 \n\t"
|
||||
"TBL v14.16b, { v14.16b }, v30.16b \n\t"
|
||||
"ROR w18, w18, #24 \n\t"
|
||||
"ROR w22, w22, #24 \n\t"
|
||||
"TBL v15.16b, { v15.16b }, v30.16b \n\t"
|
||||
"ROR w19, w19, #24 \n\t"
|
||||
/* c += d; b ^= c; b <<<= 7; */
|
||||
@ -357,7 +357,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"ADD v9.4s, v9.4s, v13.4s \n\t"
|
||||
"ADD w13, w13, w17 \n\t"
|
||||
"ADD v10.4s, v10.4s, v14.4s \n\t"
|
||||
"ADD w14, w14, w18 \n\t"
|
||||
"ADD w14, w14, w22 \n\t"
|
||||
"ADD v11.4s, v11.4s, v15.4s \n\t"
|
||||
"ADD w15, w15, w19 \n\t"
|
||||
"EOR v16.16b, v4.16b, v8.16b \n\t"
|
||||
@ -397,7 +397,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"EOR v13.16b, v13.16b, v2.16b \n\t"
|
||||
"EOR w17, w17, w6 \n\t"
|
||||
"EOR v14.16b, v14.16b, v3.16b \n\t"
|
||||
"EOR w18, w18, w7 \n\t"
|
||||
"EOR w22, w22, w7 \n\t"
|
||||
"REV32 v15.8h, v15.8h \n\t"
|
||||
"ROR w19, w19, #16 \n\t"
|
||||
"REV32 v12.8h, v12.8h \n\t"
|
||||
@ -405,7 +405,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"REV32 v13.8h, v13.8h \n\t"
|
||||
"ROR w17, w17, #16 \n\t"
|
||||
"REV32 v14.8h, v14.8h \n\t"
|
||||
"ROR w18, w18, #16 \n\t"
|
||||
"ROR w22, w22, #16 \n\t"
|
||||
/* c += d; b ^= c; b <<<= 12; */
|
||||
"ADD v10.4s, v10.4s, v15.4s \n\t"
|
||||
"ADD w14, w14, w19 \n\t"
|
||||
@ -414,7 +414,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"ADD v8.4s, v8.4s, v13.4s \n\t"
|
||||
"ADD w12, w12, w17 \n\t"
|
||||
"ADD v9.4s, v9.4s, v14.4s \n\t"
|
||||
"ADD w13, w13, w18 \n\t"
|
||||
"ADD w13, w13, w22 \n\t"
|
||||
"EOR v16.16b, v5.16b, v10.16b \n\t"
|
||||
"EOR w9, w9, w14 \n\t"
|
||||
"EOR v17.16b, v6.16b, v11.16b \n\t"
|
||||
@ -451,7 +451,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"EOR v13.16b, v13.16b, v2.16b \n\t"
|
||||
"EOR w17, w17, w6 \n\t"
|
||||
"EOR v14.16b, v14.16b, v3.16b \n\t"
|
||||
"EOR w18, w18, w7 \n\t"
|
||||
"EOR w22, w22, w7 \n\t"
|
||||
"TBL v15.16b, { v15.16b }, v30.16b \n\t"
|
||||
"ROR w19, w19, #24 \n\t"
|
||||
"TBL v12.16b, { v12.16b }, v30.16b \n\t"
|
||||
@ -459,7 +459,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"TBL v13.16b, { v13.16b }, v30.16b \n\t"
|
||||
"ROR w17, w17, #24 \n\t"
|
||||
"TBL v14.16b, { v14.16b }, v30.16b \n\t"
|
||||
"ROR w18, w18, #24 \n\t"
|
||||
"ROR w22, w22, #24 \n\t"
|
||||
/* c += d; b ^= c; b <<<= 7; */
|
||||
"ADD v10.4s, v10.4s, v15.4s \n\t"
|
||||
"ADD w14, w14, w19 \n\t"
|
||||
@ -468,7 +468,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"ADD v8.4s, v8.4s, v13.4s \n\t"
|
||||
"ADD w12, w12, w17 \n\t"
|
||||
"ADD v9.4s, v9.4s, v14.4s \n\t"
|
||||
"ADD w13, w13, w18 \n\t"
|
||||
"ADD w13, w13, w22 \n\t"
|
||||
"EOR v16.16b, v5.16b, v10.16b \n\t"
|
||||
"EOR w9, w9, w14 \n\t"
|
||||
"EOR v17.16b, v6.16b, v11.16b \n\t"
|
||||
@ -582,10 +582,10 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
"MOV v17.d[1], x10 \n\t"
|
||||
"ORR x16, x16, x17, LSL #32 \n\t"
|
||||
"MOV v18.d[0], x12 \n\t"
|
||||
"ORR x18, x18, x19, LSL #32 \n\t"
|
||||
"ORR x22, x22, x19, LSL #32 \n\t"
|
||||
"MOV v18.d[1], x14 \n\t"
|
||||
"MOV v19.d[0], x16 \n\t"
|
||||
"MOV v19.d[1], x18 \n\t"
|
||||
"MOV v19.d[1], x22 \n\t"
|
||||
/* Add back state, XOR in message and store */
|
||||
"ADD v16.4s, v16.4s, v24.4s \n\t"
|
||||
"ADD v17.4s, v17.4s, v25.4s \n\t"
|
||||
@ -606,7 +606,7 @@ static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m,
|
||||
[L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8)
|
||||
: "memory", "cc",
|
||||
"x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12",
|
||||
"x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21",
|
||||
"x13", "x14", "x15", "x16", "x17", "x22", "x19", "x20", "x21",
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
||||
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
|
||||
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"
|
||||
@ -650,11 +650,11 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"MOV x12, v22.D[0] \n\t"
|
||||
"MOV x14, v22.D[1] \n\t"
|
||||
"MOV x16, v23.D[0] \n\t"
|
||||
"MOV x18, v23.D[1] \n\t"
|
||||
"MOV x22, v23.D[1] \n\t"
|
||||
/* Move state into vector registers (x3) */
|
||||
"MOV v0.16B, v20.16B \n\t"
|
||||
"MOV v1.16B, v21.16B \n\t"
|
||||
"LSR x19, x18, #32 \n\t"
|
||||
"LSR x19, x22, #32 \n\t"
|
||||
"MOV v2.16B, v22.16B \n\t"
|
||||
"ADD w20, w16, #1 \n\t"
|
||||
"MOV v3.16B, v23.16B \n\t"
|
||||
@ -697,7 +697,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"EOR v7.16B, v7.16B, v4.16B \n\t"
|
||||
"EOR w17, w17, w5 \n\t"
|
||||
"EOR v11.16B, v11.16B, v8.16B \n\t"
|
||||
"EOR w18, w18, w6 \n\t"
|
||||
"EOR w22, w22, w6 \n\t"
|
||||
"REV32 v3.8H, v3.8H \n\t"
|
||||
"EOR w19, w19, w7 \n\t"
|
||||
"REV32 v7.8H, v7.8H \n\t"
|
||||
@ -706,7 +706,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
/* c += d; b ^= c; b <<<= 12; */
|
||||
"ROR w17, w17, #16 \n\t"
|
||||
"ADD v2.4S, v2.4S, v3.4S \n\t"
|
||||
"ROR w18, w18, #16 \n\t"
|
||||
"ROR w22, w22, #16 \n\t"
|
||||
"ADD v6.4S, v6.4S, v7.4S \n\t"
|
||||
"ROR w19, w19, #16 \n\t"
|
||||
"ADD v10.4S, v10.4S, v11.4S \n\t"
|
||||
@ -714,7 +714,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"EOR v12.16B, v1.16B, v2.16B \n\t"
|
||||
"ADD w13, w13, w17 \n\t"
|
||||
"EOR v13.16B, v5.16B, v6.16B \n\t"
|
||||
"ADD w14, w14, w18 \n\t"
|
||||
"ADD w14, w14, w22 \n\t"
|
||||
"EOR v14.16B, v9.16B, v10.16B \n\t"
|
||||
"ADD w15, w15, w19 \n\t"
|
||||
"SHL v1.4S, v12.4S, #12 \n\t"
|
||||
@ -748,7 +748,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"EOR w17, w17, w5 \n\t"
|
||||
"TBL v11.16B, { v11.16B }, v24.16B \n\t"
|
||||
/* c += d; b ^= c; b <<<= 7; */
|
||||
"EOR w18, w18, w6 \n\t"
|
||||
"EOR w22, w22, w6 \n\t"
|
||||
"ADD v2.4S, v2.4S, v3.4S \n\t"
|
||||
"EOR w19, w19, w7 \n\t"
|
||||
"ADD v6.4S, v6.4S, v7.4S \n\t"
|
||||
@ -756,7 +756,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"ADD v10.4S, v10.4S, v11.4S \n\t"
|
||||
"ROR w17, w17, #24 \n\t"
|
||||
"EOR v12.16B, v1.16B, v2.16B \n\t"
|
||||
"ROR w18, w18, #24 \n\t"
|
||||
"ROR w22, w22, #24 \n\t"
|
||||
"EOR v13.16B, v5.16B, v6.16B \n\t"
|
||||
"ROR w19, w19, #24 \n\t"
|
||||
"EOR v14.16B, v9.16B, v10.16B \n\t"
|
||||
@ -764,7 +764,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"SHL v1.4S, v12.4S, #7 \n\t"
|
||||
"ADD w13, w13, w17 \n\t"
|
||||
"SHL v5.4S, v13.4S, #7 \n\t"
|
||||
"ADD w14, w14, w18 \n\t"
|
||||
"ADD w14, w14, w22 \n\t"
|
||||
"SHL v9.4S, v14.4S, #7 \n\t"
|
||||
"ADD w15, w15, w19 \n\t"
|
||||
"SRI v1.4S, v12.4S, #25 \n\t"
|
||||
@ -803,7 +803,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"EOR v11.16B, v11.16B, v8.16B \n\t"
|
||||
"EOR w17, w17, w6 \n\t"
|
||||
"REV32 v3.8H, v3.8H \n\t"
|
||||
"EOR w18, w18, w7 \n\t"
|
||||
"EOR w22, w22, w7 \n\t"
|
||||
"REV32 v7.8H, v7.8H \n\t"
|
||||
"ROR w19, w19, #16 \n\t"
|
||||
"REV32 v11.8H, v11.8H \n\t"
|
||||
@ -812,7 +812,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"ADD v2.4S, v2.4S, v3.4S \n\t"
|
||||
"ROR w17, w17, #16 \n\t"
|
||||
"ADD v6.4S, v6.4S, v7.4S \n\t"
|
||||
"ROR w18, w18, #16 \n\t"
|
||||
"ROR w22, w22, #16 \n\t"
|
||||
"ADD v10.4S, v10.4S, v11.4S \n\t"
|
||||
"ADD w14, w14, w19 \n\t"
|
||||
"EOR v12.16B, v1.16B, v2.16B \n\t"
|
||||
@ -820,7 +820,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"EOR v13.16B, v5.16B, v6.16B \n\t"
|
||||
"ADD w12, w12, w17 \n\t"
|
||||
"EOR v14.16B, v9.16B, v10.16B \n\t"
|
||||
"ADD w13, w13, w18 \n\t"
|
||||
"ADD w13, w13, w22 \n\t"
|
||||
"SHL v1.4S, v12.4S, #12 \n\t"
|
||||
"EOR w9, w9, w14 \n\t"
|
||||
"SHL v5.4S, v13.4S, #12 \n\t"
|
||||
@ -854,7 +854,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
/* c += d; b ^= c; b <<<= 7; */
|
||||
"EOR w17, w17, w6 \n\t"
|
||||
"ADD v2.4S, v2.4S, v3.4S \n\t"
|
||||
"EOR w18, w18, w7 \n\t"
|
||||
"EOR w22, w22, w7 \n\t"
|
||||
"ADD v6.4S, v6.4S, v7.4S \n\t"
|
||||
"ROR w19, w19, #24 \n\t"
|
||||
"ADD v10.4S, v10.4S, v11.4S \n\t"
|
||||
@ -862,7 +862,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"EOR v12.16B, v1.16B, v2.16B \n\t"
|
||||
"ROR w17, w17, #24 \n\t"
|
||||
"EOR v13.16B, v5.16B, v6.16B \n\t"
|
||||
"ROR w18, w18, #24 \n\t"
|
||||
"ROR w22, w22, #24 \n\t"
|
||||
"EOR v14.16B, v9.16B, v10.16B \n\t"
|
||||
"ADD w14, w14, w19 \n\t"
|
||||
"SHL v1.4S, v12.4S, #7 \n\t"
|
||||
@ -870,7 +870,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"SHL v5.4S, v13.4S, #7 \n\t"
|
||||
"ADD w12, w12, w17 \n\t"
|
||||
"SHL v9.4S, v14.4S, #7 \n\t"
|
||||
"ADD w13, w13, w18 \n\t"
|
||||
"ADD w13, w13, w22 \n\t"
|
||||
"SRI v1.4S, v12.4S, #25 \n\t"
|
||||
"EOR w9, w9, w14 \n\t"
|
||||
"SRI v5.4S, v13.4S, #25 \n\t"
|
||||
@ -942,10 +942,10 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
"MOV v13.D[1], x10 \n\t"
|
||||
"ORR x16, x16, x17, lsl #32 \n\t"
|
||||
"MOV v14.D[0], x12 \n\t"
|
||||
"ORR x18, x18, x19, lsl #32 \n\t"
|
||||
"ORR x22, x22, x19, lsl #32 \n\t"
|
||||
"MOV v14.D[1], x14 \n\t"
|
||||
"MOV v15.D[0], x16 \n\t"
|
||||
"MOV v15.D[1], x18 \n\t"
|
||||
"MOV v15.D[1], x22 \n\t"
|
||||
/* Add back state, XOR in message and store */
|
||||
"ADD v12.4S, v12.4S, v20.4S \n\t"
|
||||
"ADD v13.4S, v13.4S, v21.4S \n\t"
|
||||
@ -960,7 +960,7 @@ static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS
|
||||
: [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8)
|
||||
: "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
|
||||
"x10", "x11", "x12", "x13", "x14", "x15", "x16",
|
||||
"x17", "x18", "x19", "x20", "x21", "v0", "v1",
|
||||
"x17", "x22", "x19", "x20", "x21", "v0", "v1",
|
||||
"v2", "v3", "v4", "v5", "v6", "v7", "v8",
|
||||
"v9", "v10", "v11", "v12", "v13", "v14",
|
||||
"v15", "v16", "v17", "v18", "v19", "v20",
|
||||
|
@ -88,18 +88,18 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
|
||||
/* t1 = U8TO64(&m[8]); */
|
||||
"LDP x16, x17, [%[m]], #16 \n\t"
|
||||
/* h0 += (U8TO32(m + 0)) & 0x3ffffff; */
|
||||
"AND x18, x16, #0x3ffffff \n\t"
|
||||
"ADD x2, x2, x18 \n\t"
|
||||
"AND x26, x16, #0x3ffffff \n\t"
|
||||
"ADD x2, x2, x26 \n\t"
|
||||
/* h1 += (U8TO32(m + 3) >> 2) & 0x3ffffff; */
|
||||
"AND x18, x14, x16, LSR #26 \n\t"
|
||||
"ADD x3, x3, x18 \n\t"
|
||||
"AND x26, x14, x16, LSR #26 \n\t"
|
||||
"ADD x3, x3, x26 \n\t"
|
||||
/* h2 += (U8TO32(m + 6) >> 4) & 0x3ffffff; */
|
||||
"EXTR x18, x17, x16, #52 \n\t"
|
||||
"AND x18, x18, #0x3ffffff \n\t"
|
||||
"ADD x4, x4, x18 \n\t"
|
||||
"EXTR x26, x17, x16, #52 \n\t"
|
||||
"AND x26, x26, #0x3ffffff \n\t"
|
||||
"ADD x4, x4, x26 \n\t"
|
||||
/* h3 += (U8TO32(m + 9) >> 6) & 0x3ffffff; */
|
||||
"AND x18, x14, x17, LSR #14 \n\t"
|
||||
"ADD x5, x5, x18 \n\t"
|
||||
"AND x26, x14, x17, LSR #14 \n\t"
|
||||
"ADD x5, x5, x26 \n\t"
|
||||
/* h4 += (U8TO32(m + 12) >> 8) | hibit; */
|
||||
"ORR x17, %[finished], x17, LSR #40 \n\t"
|
||||
"ADD x6, x6, x17 \n\t"
|
||||
@ -110,27 +110,27 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
|
||||
/* d4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0 */
|
||||
"MUL x16, x2, x21 \n\t"
|
||||
"MUL x17, x2, x22 \n\t"
|
||||
"MUL x18, x2, x23 \n\t"
|
||||
"MUL x26, x2, x23 \n\t"
|
||||
"MUL x19, x2, x24 \n\t"
|
||||
"MUL x20, x2, x25 \n\t"
|
||||
"MADD x16, x3, x10, x16 \n\t"
|
||||
"MADD x17, x3, x21, x17 \n\t"
|
||||
"MADD x18, x3, x22, x18 \n\t"
|
||||
"MADD x26, x3, x22, x26 \n\t"
|
||||
"MADD x19, x3, x23, x19 \n\t"
|
||||
"MADD x20, x3, x24, x20 \n\t"
|
||||
"MADD x16, x4, x9, x16 \n\t"
|
||||
"MADD x17, x4, x10, x17 \n\t"
|
||||
"MADD x18, x4, x21, x18 \n\t"
|
||||
"MADD x26, x4, x21, x26 \n\t"
|
||||
"MADD x19, x4, x22, x19 \n\t"
|
||||
"MADD x20, x4, x23, x20 \n\t"
|
||||
"MADD x16, x5, x8, x16 \n\t"
|
||||
"MADD x17, x5, x9, x17 \n\t"
|
||||
"MADD x18, x5, x10, x18 \n\t"
|
||||
"MADD x26, x5, x10, x26 \n\t"
|
||||
"MADD x19, x5, x21, x19 \n\t"
|
||||
"MADD x20, x5, x22, x20 \n\t"
|
||||
"MADD x16, x6, x7, x16 \n\t"
|
||||
"MADD x17, x6, x8, x17 \n\t"
|
||||
"MADD x18, x6, x9, x18 \n\t"
|
||||
"MADD x26, x6, x9, x26 \n\t"
|
||||
"MADD x19, x6, x10, x19 \n\t"
|
||||
"MADD x20, x6, x21, x20 \n\t"
|
||||
/* d1 = d1 + d0 >> 26 */
|
||||
@ -151,11 +151,11 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
|
||||
"LSR x2, x20, #26 \n\t"
|
||||
"AND x19, x19, #0x3ffffff \n\t"
|
||||
"MADD x16, x2, x15, x16 \n\t"
|
||||
"ADD x18, x18, x17, LSR #26 \n\t"
|
||||
"ADD x26, x26, x17, LSR #26 \n\t"
|
||||
"AND x17, x17, #0x3ffffff \n\t"
|
||||
"AND x20, x20, #0x3ffffff \n\t"
|
||||
"ADD x19, x19, x18, LSR #26 \n\t"
|
||||
"AND x4, x18, #0x3ffffff \n\t"
|
||||
"ADD x19, x19, x26, LSR #26 \n\t"
|
||||
"AND x4, x26, #0x3ffffff \n\t"
|
||||
"ADD x3, x17, x16, LSR #26 \n\t"
|
||||
"AND x2, x16, #0x3ffffff \n\t"
|
||||
"ADD x6, x20, x19, LSR #26 \n\t"
|
||||
@ -182,8 +182,8 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
|
||||
: "memory", "cc",
|
||||
"w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w15",
|
||||
"w21", "w22", "w23", "w24", "w25", "x2", "x3", "x4", "x5", "x6",
|
||||
"x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17", "x18", "x19",
|
||||
"x20", "x21", "x22", "x23", "x24", "x25"
|
||||
"x7", "x8", "x9", "x10", "x14", "x15", "x16", "x17", "x19", "x20",
|
||||
"x21", "x22", "x23", "x24", "x25", "x26"
|
||||
);
|
||||
}
|
||||
|
||||
@ -200,15 +200,16 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"MOV v27.D[0], x9 \n\t"
|
||||
"LDR w24, [%[h], #16] \n\t"
|
||||
"MOV v27.D[1], x9 \n\t"
|
||||
"MOV x9, #5 \n\t"
|
||||
"LSR x21, x20, #32 \n\t"
|
||||
"MOV v28.D[0], x9 \n\t"
|
||||
"DUP v29.4S, v27.S[0] \n\t"
|
||||
"LSR x23, x22, #32 \n\t"
|
||||
"MOV x9, #5 \n\t"
|
||||
"AND x20, x20, #0x3ffffff \n\t"
|
||||
"MOV v28.D[0], x9 \n\t"
|
||||
"AND x22, x22, #0x3ffffff \n\t"
|
||||
/* Zero accumulator registers */
|
||||
"MOVI v15.2D, #0x0 \n\t"
|
||||
"AND x20, x20, #0x3ffffff \n\t"
|
||||
"MOVI v16.2D, #0x0 \n\t"
|
||||
"AND x22, x22, #0x3ffffff \n\t"
|
||||
"MOVI v17.2D, #0x0 \n\t"
|
||||
"MOVI v18.2D, #0x0 \n\t"
|
||||
"MOVI v19.2D, #0x0 \n\t"
|
||||
@ -218,6 +219,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"LSL x9, x9, #24 \n\t"
|
||||
"MOV v26.D[0], x9 \n\t"
|
||||
"MOV v26.D[1], x9 \n\t"
|
||||
"DUP v30.4S, v26.S[0] \n\t"
|
||||
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t"
|
||||
"BLO L_poly1305_64_start_block_size_64_%= \n\t"
|
||||
/* Load r^2 to NEON v0, v1, v2, v3, v4 */
|
||||
@ -254,14 +256,12 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"MOV w15, v5.S[0] \n\t"
|
||||
"MOV w16, v6.S[0] \n\t"
|
||||
"MOV w17, v7.S[0] \n\t"
|
||||
"MOV w18, v8.S[0] \n\t"
|
||||
"MOV w8, v8.S[0] \n\t"
|
||||
"MOV w19, v9.S[0] \n\t"
|
||||
/* Load m */
|
||||
/* Load four message blocks to NEON v10, v11, v12, v13, v14 */
|
||||
"LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
|
||||
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
|
||||
"DUP v29.4S, v27.S[0] \n\t"
|
||||
"DUP v30.4S, v26.S[0] \n\t"
|
||||
"USHR v14.4S, v13.4S, #8 \n\t"
|
||||
"ORR v14.16B, v14.16B, v30.16B \n\t"
|
||||
"SHL v13.4S, v13.4S, #18 \n\t"
|
||||
@ -275,10 +275,6 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"AND v12.16B, v12.16B, v29.16B \n\t"
|
||||
"AND v13.16B, v13.16B, v29.16B \n\t"
|
||||
"AND v14.16B, v14.16B, v29.16B \n\t"
|
||||
"MOV v27.S[1], wzr \n\t"
|
||||
"MOV v27.S[3], wzr \n\t"
|
||||
"MOV v26.S[1], wzr \n\t"
|
||||
"MOV v26.S[3], wzr \n\t"
|
||||
/* Four message blocks loaded */
|
||||
/* Add messages to accumulator */
|
||||
"ADD v15.2S, v15.2S, v10.2S \n\t"
|
||||
@ -321,7 +317,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"UMLAL v25.2D, v16.2S, v3.2S \n\t"
|
||||
"MADD x13, x21, x28, x13 \n\t"
|
||||
"UMLAL v21.2D, v17.2S, v8.2S \n\t"
|
||||
"MADD x9, x22, x18, x9 \n\t"
|
||||
"MADD x9, x22, x8, x9 \n\t"
|
||||
"UMLAL v22.2D, v17.2S, v9.2S \n\t"
|
||||
"MADD x10, x22, x19, x10 \n\t"
|
||||
"UMLAL v23.2D, v17.2S, v0.2S \n\t"
|
||||
@ -333,7 +329,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"UMLAL v21.2D, v18.2S, v7.2S \n\t"
|
||||
"MADD x9, x23, x17, x9 \n\t"
|
||||
"UMLAL v22.2D, v18.2S, v8.2S \n\t"
|
||||
"MADD x10, x23, x18, x10 \n\t"
|
||||
"MADD x10, x23, x8, x10 \n\t"
|
||||
"UMLAL v23.2D, v18.2S, v9.2S \n\t"
|
||||
"MADD x11, x23, x19, x11 \n\t"
|
||||
"UMLAL v24.2D, v18.2S, v0.2S \n\t"
|
||||
@ -345,7 +341,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"UMLAL v22.2D, v19.2S, v7.2S \n\t"
|
||||
"MADD x10, x24, x17, x10 \n\t"
|
||||
"UMLAL v23.2D, v19.2S, v8.2S \n\t"
|
||||
"MADD x11, x24, x18, x11 \n\t"
|
||||
"MADD x11, x24, x8, x11 \n\t"
|
||||
"UMLAL v24.2D, v19.2S, v9.2S \n\t"
|
||||
"MADD x12, x24, x19, x12 \n\t"
|
||||
"UMLAL v25.2D, v19.2S, v0.2S \n\t"
|
||||
@ -460,52 +456,52 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
|
||||
/* Copy r^2 to lower half of registers */
|
||||
"MOV v0.D[0], v0.D[1] \n\t"
|
||||
"MOV v5.D[0], v5.D[1] \n\t"
|
||||
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
|
||||
"MOV v1.D[0], v1.D[1] \n\t"
|
||||
"MOV v5.D[0], v5.D[1] \n\t"
|
||||
"USHR v14.2D, v11.2D, #40 \n\t"
|
||||
"MOV v6.D[0], v6.D[1] \n\t"
|
||||
"MOV v1.D[0], v1.D[1] \n\t"
|
||||
"ORR v14.16B, v14.16B, v26.16B \n\t"
|
||||
"MOV v2.D[0], v2.D[1] \n\t"
|
||||
"MOV v6.D[0], v6.D[1] \n\t"
|
||||
"USHR v13.2D, v11.2D, #14 \n\t"
|
||||
"MOV v7.D[0], v7.D[1] \n\t"
|
||||
"MOV v2.D[0], v2.D[1] \n\t"
|
||||
"AND v13.16B, v13.16B, v27.16B \n\t"
|
||||
"MOV v3.D[0], v3.D[1] \n\t"
|
||||
"MOV v7.D[0], v7.D[1] \n\t"
|
||||
"SHL v12.2D, v11.2D, #12 \n\t"
|
||||
"MOV v8.D[0], v8.D[1] \n\t"
|
||||
"MOV v3.D[0], v3.D[1] \n\t"
|
||||
"SRI v12.2D, v10.2D, #52 \n\t"
|
||||
"MOV v4.D[0], v4.D[1] \n\t"
|
||||
"MOV v8.D[0], v8.D[1] \n\t"
|
||||
"AND v12.16B, v12.16B, v27.16B \n\t"
|
||||
"MOV v9.D[0], v9.D[1] \n\t"
|
||||
"MOV v4.D[0], v4.D[1] \n\t"
|
||||
"USHR v11.2D, v10.2D, #26 \n\t"
|
||||
"MOV v9.D[0], v9.D[1] \n\t"
|
||||
"AND v11.16B, v11.16B, v27.16B \n\t"
|
||||
/* Copy r^2 to ARM */
|
||||
"MOV w25, v0.S[2] \n\t"
|
||||
"AND v11.16B, v11.16B, v27.16B \n\t"
|
||||
"MOV w26, v1.S[2] \n\t"
|
||||
"AND v10.16B, v10.16B, v27.16B \n\t"
|
||||
"MOV w27, v2.S[2] \n\t"
|
||||
"MOV w26, v1.S[2] \n\t"
|
||||
/* Two message blocks loaded */
|
||||
/* Add last messages */
|
||||
"ADD v21.2D, v21.2D, v10.2D \n\t"
|
||||
"MOV w28, v3.S[2] \n\t"
|
||||
"MOV w27, v2.S[2] \n\t"
|
||||
"ADD v22.2D, v22.2D, v11.2D \n\t"
|
||||
"MOV w30, v4.S[2] \n\t"
|
||||
"MOV w28, v3.S[2] \n\t"
|
||||
"ADD v23.2D, v23.2D, v12.2D \n\t"
|
||||
"MOV w30, v4.S[2] \n\t"
|
||||
"ADD v24.2D, v24.2D, v13.2D \n\t"
|
||||
/* Copy 5*r^2 to ARM */
|
||||
"MOV w15, v5.S[2] \n\t"
|
||||
"ADD v24.2D, v24.2D, v13.2D \n\t"
|
||||
"MOV w16, v6.S[2] \n\t"
|
||||
"ADD v25.2D, v25.2D, v14.2D \n\t"
|
||||
"MOV w17, v7.S[2] \n\t"
|
||||
"MOV w16, v6.S[2] \n\t"
|
||||
/* Reduce message to be ready for next multiplication */
|
||||
/* Reduce radix 26 NEON */
|
||||
/* Interleave h0 -> h1 -> h2 -> h3 -> h4 */
|
||||
/* with h3 -> h4 -> h0 -> h1 */
|
||||
"USRA v22.2D, v21.2D, #26 \n\t"
|
||||
"MOV w18, v8.S[2] \n\t"
|
||||
"MOV w17, v7.S[2] \n\t"
|
||||
"AND v21.16B, v21.16B, v27.16B \n\t"
|
||||
"MOV w19, v9.S[2] \n\t"
|
||||
"MOV w8, v8.S[2] \n\t"
|
||||
"USRA v25.2D, v24.2D, #26 \n\t"
|
||||
"MOV w19, v9.S[2] \n\t"
|
||||
"AND v24.16B, v24.16B, v27.16B \n\t"
|
||||
"USHR v15.2D, v25.2D, #26 \n\t"
|
||||
"USRA v23.2D, v22.2D, #26 \n\t"
|
||||
@ -555,7 +551,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"MOV w15, v5.S[0] \n\t"
|
||||
"MOV w16, v6.S[0] \n\t"
|
||||
"MOV w17, v7.S[0] \n\t"
|
||||
"MOV w18, v8.S[0] \n\t"
|
||||
"MOV w8, v8.S[0] \n\t"
|
||||
"MOV w19, v9.S[0] \n\t"
|
||||
/* Load m */
|
||||
/* Load two message blocks to NEON v10, v11, v12, v13, v14 */
|
||||
@ -571,11 +567,11 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"USHR v11.2D, v10.2D, #26 \n\t"
|
||||
"AND v11.16B, v11.16B, v27.16B \n\t"
|
||||
"AND v10.16B, v10.16B, v27.16B \n\t"
|
||||
"MOV v10.S[1], v10.S[2] \n\t"
|
||||
"MOV v11.S[1], v11.S[2] \n\t"
|
||||
"MOV v12.S[1], v12.S[2] \n\t"
|
||||
"MOV v13.S[1], v13.S[2] \n\t"
|
||||
"MOV v14.S[1], v14.S[2] \n\t"
|
||||
"MOV v10.S[1], v10.S[2] \n\t"
|
||||
"MOV v11.S[1], v11.S[2] \n\t"
|
||||
"MOV v12.S[1], v12.S[2] \n\t"
|
||||
"MOV v13.S[1], v13.S[2] \n\t"
|
||||
"MOV v14.S[1], v14.S[2] \n\t"
|
||||
/* Two message blocks loaded */
|
||||
/* Add messages to accumulator */
|
||||
"ADD v15.2S, v15.2S, v10.2S \n\t"
|
||||
@ -618,7 +614,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"UMLAL v25.2D, v16.2S, v3.2S \n\t"
|
||||
"MADD x13, x21, x28, x13 \n\t"
|
||||
"UMLAL v21.2D, v17.2S, v8.2S \n\t"
|
||||
"MADD x9, x22, x18, x9 \n\t"
|
||||
"MADD x9, x22, x8, x9 \n\t"
|
||||
"UMLAL v22.2D, v17.2S, v9.2S \n\t"
|
||||
"MADD x10, x22, x19, x10 \n\t"
|
||||
"UMLAL v23.2D, v17.2S, v0.2S \n\t"
|
||||
@ -630,7 +626,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"UMLAL v21.2D, v18.2S, v7.2S \n\t"
|
||||
"MADD x9, x23, x17, x9 \n\t"
|
||||
"UMLAL v22.2D, v18.2S, v8.2S \n\t"
|
||||
"MADD x10, x23, x18, x10 \n\t"
|
||||
"MADD x10, x23, x8, x10 \n\t"
|
||||
"UMLAL v23.2D, v18.2S, v9.2S \n\t"
|
||||
"MADD x11, x23, x19, x11 \n\t"
|
||||
"UMLAL v24.2D, v18.2S, v0.2S \n\t"
|
||||
@ -642,7 +638,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"UMLAL v22.2D, v19.2S, v7.2S \n\t"
|
||||
"MADD x10, x24, x17, x10 \n\t"
|
||||
"UMLAL v23.2D, v19.2S, v8.2S \n\t"
|
||||
"MADD x11, x24, x18, x11 \n\t"
|
||||
"MADD x11, x24, x8, x11 \n\t"
|
||||
"UMLAL v24.2D, v19.2S, v9.2S \n\t"
|
||||
"MADD x12, x24, x19, x12 \n\t"
|
||||
"UMLAL v25.2D, v19.2S, v0.2S \n\t"
|
||||
@ -652,37 +648,37 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
|
||||
/* Reduce h % P */
|
||||
"MOV x14, #5 \n\t"
|
||||
"ADD x10, x10, x9, LSR #26 \n\t"
|
||||
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
|
||||
"ADD x13, x13, x12, LSR #26 \n\t"
|
||||
"ADD x10, x10, x9, LSR #26 \n\t"
|
||||
"USHR v14.2D, v11.2D, #40 \n\t"
|
||||
"AND x9, x9, #0x3ffffff \n\t"
|
||||
"ADD x13, x13, x12, LSR #26 \n\t"
|
||||
"ORR v14.16B, v14.16B, v26.16B \n\t"
|
||||
"LSR x20, x13, #26 \n\t"
|
||||
"AND x9, x9, #0x3ffffff \n\t"
|
||||
"USHR v13.2D, v11.2D, #14 \n\t"
|
||||
"AND x12, x12, #0x3ffffff \n\t"
|
||||
"LSR x20, x13, #26 \n\t"
|
||||
"AND v13.16B, v13.16B, v27.16B \n\t"
|
||||
"MADD x9, x20, x14, x9 \n\t"
|
||||
"AND x12, x12, #0x3ffffff \n\t"
|
||||
"SHL v12.2D, v11.2D, #12 \n\t"
|
||||
"ADD x11, x11, x10, LSR #26 \n\t"
|
||||
"MADD x9, x20, x14, x9 \n\t"
|
||||
"SRI v12.2D, v10.2D, #52 \n\t"
|
||||
"AND x10, x10, #0x3ffffff \n\t"
|
||||
"ADD x11, x11, x10, LSR #26 \n\t"
|
||||
"AND v12.16B, v12.16B, v27.16B \n\t"
|
||||
"AND x13, x13, #0x3ffffff \n\t"
|
||||
"AND x10, x10, #0x3ffffff \n\t"
|
||||
"USHR v11.2D, v10.2D, #26 \n\t"
|
||||
"ADD x12, x12, x11, LSR #26 \n\t"
|
||||
"AND x13, x13, #0x3ffffff \n\t"
|
||||
"AND v11.16B, v11.16B, v27.16B \n\t"
|
||||
"AND x22, x11, #0x3ffffff \n\t"
|
||||
"ADD x12, x12, x11, LSR #26 \n\t"
|
||||
"AND v10.16B, v10.16B, v27.16B \n\t"
|
||||
"ADD x21, x10, x9, LSR #26 \n\t"
|
||||
"AND x22, x11, #0x3ffffff \n\t"
|
||||
/* Two message blocks loaded */
|
||||
"ADD v21.2D, v21.2D, v10.2D \n\t"
|
||||
"AND x20, x9, #0x3ffffff \n\t"
|
||||
"ADD x21, x10, x9, LSR #26 \n\t"
|
||||
"ADD v22.2D, v22.2D, v11.2D \n\t"
|
||||
"ADD x24, x13, x12, LSR #26 \n\t"
|
||||
"AND x20, x9, #0x3ffffff \n\t"
|
||||
"ADD v23.2D, v23.2D, v12.2D \n\t"
|
||||
"AND x23, x12, #0x3ffffff \n\t"
|
||||
"ADD x24, x13, x12, LSR #26 \n\t"
|
||||
"ADD v24.2D, v24.2D, v13.2D \n\t"
|
||||
"AND x23, x12, #0x3ffffff \n\t"
|
||||
"ADD v25.2D, v25.2D, v14.2D \n\t"
|
||||
/* Reduce radix 26 NEON */
|
||||
/* Interleave h0 -> h1 -> h2 -> h3 -> h4 */
|
||||
@ -752,7 +748,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"UMULL v22.2D, v15.2S, v1.2S \n\t"
|
||||
"MADD x13, x21, x28, x13 \n\t"
|
||||
"UMULL v23.2D, v15.2S, v2.2S \n\t"
|
||||
"MADD x9, x22, x18, x9 \n\t"
|
||||
"MADD x9, x22, x8, x9 \n\t"
|
||||
"UMULL v24.2D, v15.2S, v3.2S \n\t"
|
||||
"MADD x10, x22, x19, x10 \n\t"
|
||||
"UMULL v25.2D, v15.2S, v4.2S \n\t"
|
||||
@ -764,7 +760,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"UMLAL v23.2D, v16.2S, v1.2S \n\t"
|
||||
"MADD x9, x23, x17, x9 \n\t"
|
||||
"UMLAL v24.2D, v16.2S, v2.2S \n\t"
|
||||
"MADD x10, x23, x18, x10 \n\t"
|
||||
"MADD x10, x23, x8, x10 \n\t"
|
||||
"UMLAL v25.2D, v16.2S, v3.2S \n\t"
|
||||
"MADD x11, x23, x19, x11 \n\t"
|
||||
"UMLAL v21.2D, v17.2S, v8.2S \n\t"
|
||||
@ -776,7 +772,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
"UMLAL v24.2D, v17.2S, v1.2S \n\t"
|
||||
"MADD x10, x24, x17, x10 \n\t"
|
||||
"UMLAL v25.2D, v17.2S, v2.2S \n\t"
|
||||
"MADD x11, x24, x18, x11 \n\t"
|
||||
"MADD x11, x24, x8, x11 \n\t"
|
||||
"UMLAL v21.2D, v18.2S, v7.2S \n\t"
|
||||
"MADD x12, x24, x19, x12 \n\t"
|
||||
"UMLAL v22.2D, v18.2S, v8.2S \n\t"
|
||||
@ -866,12 +862,12 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
||||
: "memory", "cc",
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
||||
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
|
||||
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "w9", "w10",
|
||||
"w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20",
|
||||
"w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28", "w30", "x9",
|
||||
"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
|
||||
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x30",
|
||||
"v29", "v30"
|
||||
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||
"w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15", "w16", "w17",
|
||||
"w19", "w20", "w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28",
|
||||
"w30", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
|
||||
"x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27",
|
||||
"x28", "x30"
|
||||
);
|
||||
poly1305_blocks_16(ctx, m, bytes);
|
||||
}
|
||||
@ -950,42 +946,42 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||
"MUL x15, x19, x20 \n\t"
|
||||
"MUL x16, x19, x21 \n\t"
|
||||
"MUL x17, x19, x22 \n\t"
|
||||
"MUL x18, x19, x23 \n\t"
|
||||
"MUL x7, x19, x23 \n\t"
|
||||
"MADD x14, x20, x27, x14 \n\t"
|
||||
"MADD x15, x20, x19, x15 \n\t"
|
||||
"MADD x16, x20, x20, x16 \n\t"
|
||||
"MADD x17, x20, x21, x17 \n\t"
|
||||
"MADD x18, x20, x22, x18 \n\t"
|
||||
"MADD x7, x20, x22, x7 \n\t"
|
||||
"MADD x14, x21, x26, x14 \n\t"
|
||||
"MADD x15, x21, x27, x15 \n\t"
|
||||
"MADD x16, x21, x19, x16 \n\t"
|
||||
"MADD x17, x21, x20, x17 \n\t"
|
||||
"MADD x18, x21, x21, x18 \n\t"
|
||||
"MADD x7, x21, x21, x7 \n\t"
|
||||
"MADD x14, x22, x25, x14 \n\t"
|
||||
"MADD x15, x22, x26, x15 \n\t"
|
||||
"MADD x16, x22, x27, x16 \n\t"
|
||||
"MADD x17, x22, x19, x17 \n\t"
|
||||
"MADD x18, x22, x20, x18 \n\t"
|
||||
"MADD x7, x22, x20, x7 \n\t"
|
||||
"MADD x14, x23, x24, x14 \n\t"
|
||||
"MADD x15, x23, x25, x15 \n\t"
|
||||
"MADD x16, x23, x26, x16 \n\t"
|
||||
"MADD x17, x23, x27, x17 \n\t"
|
||||
"MADD x18, x23, x19, x18 \n\t"
|
||||
"MADD x7, x23, x19, x7 \n\t"
|
||||
/* r_2 = r^2 % P */
|
||||
"ADD x15, x15, x14, LSR #26 \n\t"
|
||||
"ADD x18, x18, x17, LSR #26 \n\t"
|
||||
"ADD x7, x7, x17, LSR #26 \n\t"
|
||||
"AND x14, x14, #0x3ffffff \n\t"
|
||||
"LSR x9, x18, #26 \n\t"
|
||||
"LSR x9, x7, #26 \n\t"
|
||||
"AND x17, x17, #0x3ffffff \n\t"
|
||||
"MADD x14, x9, x8, x14 \n\t"
|
||||
"ADD x16, x16, x15, LSR #26 \n\t"
|
||||
"AND x15, x15, #0x3ffffff \n\t"
|
||||
"AND x18, x18, #0x3ffffff \n\t"
|
||||
"AND x7, x7, #0x3ffffff \n\t"
|
||||
"ADD x17, x17, x16, LSR #26 \n\t"
|
||||
"AND x16, x16, #0x3ffffff \n\t"
|
||||
"ADD x15, x15, x14, LSR #26 \n\t"
|
||||
"AND x14, x14, #0x3ffffff \n\t"
|
||||
"ADD x18, x18, x17, LSR #26 \n\t"
|
||||
"ADD x7, x7, x17, LSR #26 \n\t"
|
||||
"AND x17, x17, #0x3ffffff \n\t"
|
||||
/* Store r */
|
||||
"ORR x19, x19, x20, LSL #32 \n\t"
|
||||
@ -996,7 +992,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||
"MUL x24, x15, x8 \n\t"
|
||||
"MUL x25, x16, x8 \n\t"
|
||||
"MUL x26, x17, x8 \n\t"
|
||||
"MUL x27, x18, x8 \n\t"
|
||||
"MUL x27, x7, x8 \n\t"
|
||||
/* Compute r^4 */
|
||||
/* d0 = h0 * r0 + h1 * s4 + h2 * s3 + h3 * s2 + h4 * s1 */
|
||||
/* d1 = h0 * r1 + h1 * r0 + h2 * s4 + h3 * s3 + h4 * s2 */
|
||||
@ -1007,7 +1003,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||
"MUL x20, x14, x15 \n\t"
|
||||
"MUL x21, x14, x16 \n\t"
|
||||
"MUL x22, x14, x17 \n\t"
|
||||
"MUL x23, x14, x18 \n\t"
|
||||
"MUL x23, x14, x7 \n\t"
|
||||
"MADD x19, x15, x27, x19 \n\t"
|
||||
"MADD x20, x15, x14, x20 \n\t"
|
||||
"MADD x21, x15, x15, x21 \n\t"
|
||||
@ -1023,11 +1019,11 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||
"MADD x21, x17, x27, x21 \n\t"
|
||||
"MADD x22, x17, x14, x22 \n\t"
|
||||
"MADD x23, x17, x15, x23 \n\t"
|
||||
"MADD x19, x18, x24, x19 \n\t"
|
||||
"MADD x20, x18, x25, x20 \n\t"
|
||||
"MADD x21, x18, x26, x21 \n\t"
|
||||
"MADD x22, x18, x27, x22 \n\t"
|
||||
"MADD x23, x18, x14, x23 \n\t"
|
||||
"MADD x19, x7, x24, x19 \n\t"
|
||||
"MADD x20, x7, x25, x20 \n\t"
|
||||
"MADD x21, x7, x26, x21 \n\t"
|
||||
"MADD x22, x7, x27, x22 \n\t"
|
||||
"MADD x23, x7, x14, x23 \n\t"
|
||||
/* r^4 % P */
|
||||
"ADD x20, x20, x19, LSR #26 \n\t"
|
||||
"ADD x23, x23, x22, LSR #26 \n\t"
|
||||
@ -1048,7 +1044,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||
"ORR x14, x14, x15, LSL #32 \n\t"
|
||||
"ORR x16, x16, x17, LSL #32 \n\t"
|
||||
"STP x14, x16, [%[ctx_r_2]] \n\t"
|
||||
"STR w18, [%[ctx_r_2], #16] \n\t"
|
||||
"STR w7, [%[ctx_r_2], #16] \n\t"
|
||||
/* Store r^4 */
|
||||
"ORR x19, x19, x20, LSL #32 \n\t"
|
||||
"ORR x21, x21, x22, LSL #32 \n\t"
|
||||
@ -1074,9 +1070,9 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
|
||||
[ctx_leftover] "r" (&ctx->leftover),
|
||||
[ctx_finished] "r" (&ctx->finished)
|
||||
: "memory", "cc",
|
||||
"w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23",
|
||||
"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",
|
||||
"x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
|
||||
"w7", "w14", "w15", "w16", "w17", "w19", "w20", "w21", "w22", "w23",
|
||||
"x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
|
||||
"x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
|
||||
);
|
||||
|
||||
return 0;
|
||||
|
Reference in New Issue
Block a user