mirror of
https://github.com/wolfSSL/wolfssl.git
synced 2025-07-30 18:57:27 +02:00
Merge pull request #2332 from SparkiDev/poly1305_arm64
Improve performance of Poly1305 on ARM64
This commit is contained in:
@ -195,11 +195,10 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
|||||||
"BLO L_poly1305_64_done_%= \n\t"
|
"BLO L_poly1305_64_done_%= \n\t"
|
||||||
"MOV x9, #0x3ffffff \n\t"
|
"MOV x9, #0x3ffffff \n\t"
|
||||||
/* Load h */
|
/* Load h */
|
||||||
"LDP x20, x22, [%[h]], #16 \n\t"
|
"LDP x20, x22, [%[h]] \n\t"
|
||||||
"MOV v27.D[0], x9 \n\t"
|
"MOV v27.D[0], x9 \n\t"
|
||||||
"LDR w24, [%[h]] \n\t"
|
"LDR w24, [%[h], #16] \n\t"
|
||||||
"MOV v27.D[1], x9 \n\t"
|
"MOV v27.D[1], x9 \n\t"
|
||||||
"SUB %[h], %[h], #16 \n\t"
|
|
||||||
"MOV x9, #5 \n\t"
|
"MOV x9, #5 \n\t"
|
||||||
"LSR x21, x20, #32 \n\t"
|
"LSR x21, x20, #32 \n\t"
|
||||||
"MOV v28.D[0], x9 \n\t"
|
"MOV v28.D[0], x9 \n\t"
|
||||||
@ -258,26 +257,23 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
|||||||
"MOV w19, v9.S[0] \n\t"
|
"MOV w19, v9.S[0] \n\t"
|
||||||
/* Load m */
|
/* Load m */
|
||||||
/* Load four message blocks to NEON v10, v11, v12, v13, v14 */
|
/* Load four message blocks to NEON v10, v11, v12, v13, v14 */
|
||||||
"LD4 { v10.S-v13.S }[0], [%[m]], #16 \n\t"
|
"LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
|
||||||
"LD4 { v10.S-v13.S }[1], [%[m]], #16 \n\t"
|
|
||||||
"LD4 { v10.S-v13.S }[2], [%[m]], #16 \n\t"
|
|
||||||
"LD4 { v10.S-v13.S }[3], [%[m]], #16 \n\t"
|
|
||||||
"SUB %[bytes], %[bytes], #4*%[POLY1305_BLOCK_SIZE] \n\t"
|
"SUB %[bytes], %[bytes], #4*%[POLY1305_BLOCK_SIZE] \n\t"
|
||||||
"DUP v27.4S, v27.S[0] \n\t"
|
"DUP v29.4S, v27.S[0] \n\t"
|
||||||
"DUP v26.4S, v26.S[0] \n\t"
|
"DUP v30.4S, v26.S[0] \n\t"
|
||||||
"USHR v14.4S, v13.4S, #8 \n\t"
|
"USHR v14.4S, v13.4S, #8 \n\t"
|
||||||
"ORR v14.16B, v14.16B, v26.16B \n\t"
|
"ORR v14.16B, v14.16B, v30.16B \n\t"
|
||||||
"SHL v13.4S, v13.4S, #18 \n\t"
|
"SHL v13.4S, v13.4S, #18 \n\t"
|
||||||
"SRI v13.4S, v12.4S, #14 \n\t"
|
"SRI v13.4S, v12.4S, #14 \n\t"
|
||||||
"SHL v12.4S, v12.4S, #12 \n\t"
|
"SHL v12.4S, v12.4S, #12 \n\t"
|
||||||
"SRI v12.4S, v11.4S, #20 \n\t"
|
"SRI v12.4S, v11.4S, #20 \n\t"
|
||||||
"SHL v11.4S, v11.4S, #6 \n\t"
|
"SHL v11.4S, v11.4S, #6 \n\t"
|
||||||
"SRI v11.4S, v10.4S, #26 \n\t"
|
"SRI v11.4S, v10.4S, #26 \n\t"
|
||||||
"AND v10.16B, v10.16B, v27.16B \n\t"
|
"AND v10.16B, v10.16B, v29.16B \n\t"
|
||||||
"AND v11.16B, v11.16B, v27.16B \n\t"
|
"AND v11.16B, v11.16B, v29.16B \n\t"
|
||||||
"AND v12.16B, v12.16B, v27.16B \n\t"
|
"AND v12.16B, v12.16B, v29.16B \n\t"
|
||||||
"AND v13.16B, v13.16B, v27.16B \n\t"
|
"AND v13.16B, v13.16B, v29.16B \n\t"
|
||||||
"AND v14.16B, v14.16B, v27.16B \n\t"
|
"AND v14.16B, v14.16B, v29.16B \n\t"
|
||||||
"MOV v27.S[1], wzr \n\t"
|
"MOV v27.S[1], wzr \n\t"
|
||||||
"MOV v27.S[3], wzr \n\t"
|
"MOV v27.S[3], wzr \n\t"
|
||||||
"MOV v26.S[1], wzr \n\t"
|
"MOV v26.S[1], wzr \n\t"
|
||||||
@ -405,30 +401,21 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
|||||||
"BLS L_poly1305_64_loop_128_final_%= \n\t"
|
"BLS L_poly1305_64_loop_128_final_%= \n\t"
|
||||||
/* Load m */
|
/* Load m */
|
||||||
/* Load four message blocks to NEON v10, v11, v12, v13, v14 */
|
/* Load four message blocks to NEON v10, v11, v12, v13, v14 */
|
||||||
"LD4 { v10.S-v13.S }[0], [%[m]], #16 \n\t"
|
"LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
|
||||||
"LD4 { v10.S-v13.S }[1], [%[m]], #16 \n\t"
|
|
||||||
"LD4 { v10.S-v13.S }[2], [%[m]], #16 \n\t"
|
|
||||||
"LD4 { v10.S-v13.S }[3], [%[m]], #16 \n\t"
|
|
||||||
"SUB %[bytes], %[bytes], #4*%[POLY1305_BLOCK_SIZE] \n\t"
|
"SUB %[bytes], %[bytes], #4*%[POLY1305_BLOCK_SIZE] \n\t"
|
||||||
"DUP v27.4S, v27.S[0] \n\t"
|
|
||||||
"DUP v26.4S, v26.S[0] \n\t"
|
|
||||||
"USHR v14.4S, v13.4S, #8 \n\t"
|
"USHR v14.4S, v13.4S, #8 \n\t"
|
||||||
"ORR v14.16B, v14.16B, v26.16B \n\t"
|
"ORR v14.16B, v14.16B, v30.16B \n\t"
|
||||||
"SHL v13.4S, v13.4S, #18 \n\t"
|
"SHL v13.4S, v13.4S, #18 \n\t"
|
||||||
"SRI v13.4S, v12.4S, #14 \n\t"
|
"SRI v13.4S, v12.4S, #14 \n\t"
|
||||||
"SHL v12.4S, v12.4S, #12 \n\t"
|
"SHL v12.4S, v12.4S, #12 \n\t"
|
||||||
"SRI v12.4S, v11.4S, #20 \n\t"
|
"SRI v12.4S, v11.4S, #20 \n\t"
|
||||||
"SHL v11.4S, v11.4S, #6 \n\t"
|
"SHL v11.4S, v11.4S, #6 \n\t"
|
||||||
"SRI v11.4S, v10.4S, #26 \n\t"
|
"SRI v11.4S, v10.4S, #26 \n\t"
|
||||||
"AND v10.16B, v10.16B, v27.16B \n\t"
|
"AND v10.16B, v10.16B, v29.16B \n\t"
|
||||||
"AND v11.16B, v11.16B, v27.16B \n\t"
|
"AND v11.16B, v11.16B, v29.16B \n\t"
|
||||||
"AND v12.16B, v12.16B, v27.16B \n\t"
|
"AND v12.16B, v12.16B, v29.16B \n\t"
|
||||||
"AND v13.16B, v13.16B, v27.16B \n\t"
|
"AND v13.16B, v13.16B, v29.16B \n\t"
|
||||||
"AND v14.16B, v14.16B, v27.16B \n\t"
|
"AND v14.16B, v14.16B, v29.16B \n\t"
|
||||||
"MOV v27.S[1], wzr \n\t"
|
|
||||||
"MOV v27.S[3], wzr \n\t"
|
|
||||||
"MOV v26.S[1], wzr \n\t"
|
|
||||||
"MOV v26.S[3], wzr \n\t"
|
|
||||||
/* Four message blocks loaded */
|
/* Four message blocks loaded */
|
||||||
/* Add new message block to accumulator */
|
/* Add new message block to accumulator */
|
||||||
"UADDW v21.2D, v21.2D, v10.2S \n\t"
|
"UADDW v21.2D, v21.2D, v10.2S \n\t"
|
||||||
@ -469,10 +456,9 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
|||||||
"L_poly1305_64_loop_128_final_%=: \n\t"
|
"L_poly1305_64_loop_128_final_%=: \n\t"
|
||||||
/* Load m */
|
/* Load m */
|
||||||
/* Load two message blocks to NEON v10, v11, v12, v13, v14 */
|
/* Load two message blocks to NEON v10, v11, v12, v13, v14 */
|
||||||
"LD2 { v10.D-v11.D }[0], [%[m]], #16 \n\t"
|
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
|
||||||
/* Copy r^2 to lower half of registers */
|
/* Copy r^2 to lower half of registers */
|
||||||
"MOV v0.D[0], v0.D[1] \n\t"
|
"MOV v0.D[0], v0.D[1] \n\t"
|
||||||
"LD2 { v10.D-v11.D }[1], [%[m]], #16 \n\t"
|
|
||||||
"MOV v5.D[0], v5.D[1] \n\t"
|
"MOV v5.D[0], v5.D[1] \n\t"
|
||||||
"SUB %[bytes], %[bytes], #2*%[POLY1305_BLOCK_SIZE] \n\t"
|
"SUB %[bytes], %[bytes], #2*%[POLY1305_BLOCK_SIZE] \n\t"
|
||||||
"MOV v1.D[0], v1.D[1] \n\t"
|
"MOV v1.D[0], v1.D[1] \n\t"
|
||||||
@ -572,8 +558,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
|||||||
"MOV w19, v9.S[0] \n\t"
|
"MOV w19, v9.S[0] \n\t"
|
||||||
/* Load m */
|
/* Load m */
|
||||||
/* Load two message blocks to NEON v10, v11, v12, v13, v14 */
|
/* Load two message blocks to NEON v10, v11, v12, v13, v14 */
|
||||||
"LD2 { v10.D-v11.D }[0], [%[m]], #16 \n\t"
|
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
|
||||||
"LD2 { v10.D-v11.D }[1], [%[m]], #16 \n\t"
|
|
||||||
"SUB %[bytes], %[bytes], #2*%[POLY1305_BLOCK_SIZE] \n\t"
|
"SUB %[bytes], %[bytes], #2*%[POLY1305_BLOCK_SIZE] \n\t"
|
||||||
"USHR v14.2D, v11.2D, #40 \n\t"
|
"USHR v14.2D, v11.2D, #40 \n\t"
|
||||||
"ORR v14.16B, v14.16B, v26.16B \n\t"
|
"ORR v14.16B, v14.16B, v26.16B \n\t"
|
||||||
@ -663,10 +648,9 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
|||||||
"MADD x13, x24, x25, x13 \n\t"
|
"MADD x13, x24, x25, x13 \n\t"
|
||||||
/* Load m */
|
/* Load m */
|
||||||
/* Load two message blocks to NEON v10, v11, v12, v13, v14 */
|
/* Load two message blocks to NEON v10, v11, v12, v13, v14 */
|
||||||
"LD2 { v10.D-v11.D }[0], [%[m]], #16 \n\t"
|
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
|
||||||
/* Reduce h % P */
|
/* Reduce h % P */
|
||||||
"MOV x14, #5 \n\t"
|
"MOV x14, #5 \n\t"
|
||||||
"LD2 { v10.D-v11.D }[1], [%[m]], #16 \n\t"
|
|
||||||
"ADD x10, x10, x9, LSR #26 \n\t"
|
"ADD x10, x10, x9, LSR #26 \n\t"
|
||||||
"SUB %[bytes], %[bytes], #2*%[POLY1305_BLOCK_SIZE] \n\t"
|
"SUB %[bytes], %[bytes], #2*%[POLY1305_BLOCK_SIZE] \n\t"
|
||||||
"ADD x13, x13, x12, LSR #26 \n\t"
|
"ADD x13, x13, x12, LSR #26 \n\t"
|
||||||
@ -885,7 +869,8 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
|
|||||||
"w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20",
|
"w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20",
|
||||||
"w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28", "w29", "x9",
|
"w21", "w22", "w23", "w24", "w25", "w26", "w27", "w28", "w29", "x9",
|
||||||
"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
|
"x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
|
||||||
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29"
|
"x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29",
|
||||||
|
"v29", "v30"
|
||||||
);
|
);
|
||||||
poly1305_blocks_16(ctx, m, bytes);
|
poly1305_blocks_16(ctx, m, bytes);
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user