ARMv8: increase performance

Jacob Barthelmeh
2016-10-05 09:04:18 -06:00
parent 6f4b704552
commit 21dd236ef2
2 changed files with 994 additions and 508 deletions


@ -485,8 +485,8 @@ int wc_InitAes_h(Aes* aes, void* h)
"LD1 {v5.2d-v8.2d}, %[Key], #64 \n"
"LD1 {v9.2d-v11.2d},%[Key], #48 \n"
"LD1 {v0.2d}, %[reg] \n"
"LD1 {v12.2d}, [%[input]], #16 \n"
"LD1 {v12.2d}, [%[input]], #16 \n"
"AESCBC128Block:\n"
"#CBC operations, xorbuf in with current aes->reg \n"
"EOR v0.16b, v0.16b, v12.16b \n"
@ -509,8 +509,8 @@ int wc_InitAes_h(Aes* aes, void* h)
"AESE v0.16b, v9.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v0.16b, v10.16b \n"
"EOR v0.16b, v0.16b, v11.16b \n"
"SUB w11, w11, #1 \n"
"EOR v0.16b, v0.16b, v11.16b \n"
"ST1 {v0.2d}, [%[out]], #16 \n"
"CBZ w11, AESCBC128end \n"
@ -525,7 +525,7 @@ int wc_InitAes_h(Aes* aes, void* h)
:"0" (out), [Key] "m" (aes->key), [input] "r" (in),
[blocks] "r" (numBlocks), [reg] "m" (aes->reg)
: "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v8", "v9", "v10", "v11", "v12"
"v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"
);
break;
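The CBC loop above keeps the chaining value resident in v0: each iteration XORs the next plaintext block into it, runs the AESE/AESMC rounds, and stores the result, which doubles as the next chain value. A minimal C sketch of one iteration, assuming wolfCrypt's xorbuf() and a one-block wc_AesEncrypt() as black boxes (illustrative of the asm's data flow, not the committed code):

    /* sketch: chain mirrors aes->reg (v0); one block per iteration */
    xorbuf(chain, in, AES_BLOCK_SIZE);       /* EOR v0.16b, v0.16b, v12.16b */
    wc_AesEncrypt(aes, chain, out);          /* AESE/AESMC rounds, final EOR */
    XMEMCPY(chain, out, AES_BLOCK_SIZE);     /* ciphertext chains forward */
    in  += AES_BLOCK_SIZE;
    out += AES_BLOCK_SIZE;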
@ -905,7 +905,85 @@ int wc_InitAes_h(Aes* aes, void* h)
"LD1 {v9.2d-v11.2d}, [%[Key]], #48\n"
"LD1 {v13.2d}, %[reg] \n"
/* double block */
"AESCTR128Block2: \n"
"CMP w11, #1 \n"
"BEQ AESCTR128Block \n"
"CMP w11, #0 \n"
"BEQ AESCTRend \n"
"MOV v0.16b, v13.16b \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"REV64 v13.16b, v13.16b \n" /* network order */
"AESE v0.16b, v2.16b \n"
"AESMC v0.16b, v0.16b \n"
"EXT v13.16b, v13.16b, v13.16b, #8 \n"
"SUB w11, w11, #2 \n"
"ADD v15.2d, v13.2d, v14.2d \n" /* add 1 to counter */
"ADD v13.2d, v15.2d, v14.2d \n" /* add 1 to counter */
"AESE v0.16b, v3.16b \n"
"AESMC v0.16b, v0.16b \n"
"EXT v15.16b, v15.16b, v15.16b, #8 \n"
"EXT v13.16b, v13.16b, v13.16b, #8 \n"
"AESE v0.16b, v4.16b \n"
"AESMC v0.16b, v0.16b \n"
"REV64 v15.16b, v15.16b \n" /* revert from network order */
"REV64 v13.16b, v13.16b \n" /* revert from network order */
"AESE v0.16b, v5.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v15.16b, v1.16b \n"
"AESMC v15.16b, v15.16b \n"
"AESE v0.16b, v6.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v15.16b, v2.16b \n"
"AESMC v15.16b, v15.16b \n"
"AESE v0.16b, v7.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v15.16b, v3.16b \n"
"AESMC v15.16b, v15.16b \n"
"AESE v0.16b, v8.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v15.16b, v4.16b \n"
"AESMC v15.16b, v15.16b \n"
"AESE v0.16b, v9.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v15.16b, v5.16b \n"
"AESMC v15.16b, v15.16b \n"
"AESE v0.16b, v10.16b \n"
"AESE v15.16b, v6.16b \n"
"AESMC v15.16b, v15.16b \n"
"EOR v0.16b, v0.16b, v11.16b \n"
"AESE v15.16b, v7.16b \n"
"AESMC v15.16b, v15.16b \n"
"LD1 {v12.2d}, [%[input]], #16 \n"
"AESE v15.16b, v8.16b \n"
"AESMC v15.16b, v15.16b \n"
"EOR v0.16b, v0.16b, v12.16b \n"
"AESE v15.16b, v9.16b \n"
"AESMC v15.16b, v15.16b \n"
"LD1 {v12.2d}, [%[input]], #16 \n"
"AESE v15.16b, v10.16b \n"
"ST1 {v0.2d}, [%[out]], #16 \n"
"EOR v15.16b, v15.16b, v11.16b \n"
"EOR v15.16b, v15.16b, v12.16b \n"
"ST1 {v15.2d}, [%[out]], #16 \n"
"B AESCTR128Block2 \n"
/* single block */
"AESCTR128Block: \n"
"MOV v0.16b, v13.16b \n"
"AESE v0.16b, v1.16b \n"
@ -935,13 +1013,10 @@ int wc_InitAes_h(Aes* aes, void* h)
"AESE v0.16b, v10.16b \n"
"EOR v0.16b, v0.16b, v11.16b \n"
"#CTR operations, increment counter and xorbuf \n"
"LD1 {v12.2d}, [%[input]], #16 \n"
"EOR v0.16b, v0.16b, v12.16b \n"
"ST1 {v0.2d}, [%[out]], #16 \n"
"CBZ w11, AESCTRend \n"
"LD1 {v12.2d}, [%[input]], #16 \n"
"B AESCTR128Block \n"
"AESCTRend: \n"
"#store current counter value at the end \n"
"ST1 {v13.2d}, %[regOut] \n"
@ -970,9 +1045,97 @@ int wc_InitAes_h(Aes* aes, void* h)
"LD1 {v9.2d-v12.2d}, [%[Key]], #64\n"
"LD1 {v15.2d}, %[reg] \n"
"LD1 {v13.16b}, [%[Key]], #16 \n"
/* double block */
"AESCTR192Block2: \n"
"CMP w11, #1 \n"
"BEQ AESCTR192Block \n"
"CMP w11, #0 \n"
"BEQ AESCTR192end \n"
"MOV v0.16b, v15.16b \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"REV64 v15.16b, v15.16b \n" /* network order */
"AESE v0.16b, v2.16b \n"
"AESMC v0.16b, v0.16b \n"
"EXT v15.16b, v15.16b, v15.16b, #8 \n"
"SUB w11, w11, #2 \n"
"ADD v17.2d, v15.2d, v16.2d \n" /* add 1 to counter */
"ADD v15.2d, v17.2d, v16.2d \n" /* add 1 to counter */
"AESE v0.16b, v3.16b \n"
"AESMC v0.16b, v0.16b \n"
"EXT v17.16b, v17.16b, v17.16b, #8 \n"
"EXT v15.16b, v15.16b, v15.16b, #8 \n"
"AESE v0.16b, v4.16b \n"
"AESMC v0.16b, v0.16b \n"
"REV64 v17.16b, v17.16b \n" /* revert from network order */
"REV64 v15.16b, v15.16b \n" /* revert from network order */
"AESE v0.16b, v5.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v17.16b, v1.16b \n"
"AESMC v17.16b, v17.16b \n"
"AESE v0.16b, v6.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v17.16b, v2.16b \n"
"AESMC v17.16b, v17.16b \n"
"AESE v0.16b, v7.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v17.16b, v3.16b \n"
"AESMC v17.16b, v17.16b \n"
"AESE v0.16b, v8.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v17.16b, v4.16b \n"
"AESMC v17.16b, v17.16b \n"
"AESE v0.16b, v9.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v17.16b, v5.16b \n"
"AESMC v17.16b, v17.16b \n"
"AESE v0.16b, v10.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v17.16b, v6.16b \n"
"AESMC v17.16b, v17.16b \n"
"AESE v0.16b, v11.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v17.16b, v7.16b \n"
"AESMC v17.16b, v17.16b \n"
"AESE v0.16b, v12.16b \n"
"AESE v17.16b, v8.16b \n"
"AESMC v17.16b, v17.16b \n"
"EOR v0.16b, v0.16b, v13.16b \n"
"AESE v17.16b, v9.16b \n"
"AESMC v17.16b, v17.16b \n"
"LD1 {v14.2d}, [%[input]], #16 \n"
"AESE v17.16b, v10.16b \n"
"AESMC v17.16b, v17.16b \n"
"EOR v0.16b, v0.16b, v14.16b \n"
"AESE v17.16b, v11.16b \n"
"AESMC v17.16b, v17.16b \n"
"LD1 {v14.2d}, [%[input]], #16 \n"
"AESE v17.16b, v12.16b \n"
"ST1 {v0.2d}, [%[out]], #16 \n"
"EOR v17.16b, v17.16b, v13.16b \n"
"EOR v17.16b, v17.16b, v14.16b \n"
"ST1 {v17.2d}, [%[out]], #16 \n"
"B AESCTR192Block2 \n"
"AESCTR192Block: \n"
"LD1 {v14.2d}, [%[input]], #16 \n"
"MOV v0.16b, v15.16b \n"
"AESE v0.16b, v1.16b \n"
@ -1009,10 +1172,6 @@ int wc_InitAes_h(Aes* aes, void* h)
"EOR v0.16b, v0.16b, v14.16b \n"
"ST1 {v0.2d}, [%[out]], #16 \n"
"CBZ w11, AESCTR192end \n"
"LD1 {v14.2d}, [%[input]], #16 \n"
"B AESCTR192Block \n"
"AESCTR192end: \n"
"#store current counter value at the end \n"
"ST1 {v15.2d}, %[regOut] \n"
@ -1023,7 +1182,7 @@ int wc_InitAes_h(Aes* aes, void* h)
[blocks] "r" (numBlocks), [reg] "m" (aes->reg)
: "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15",
"v16"
"v16", "v17"
);
break;
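The REV64/EXT/ADD/EXT/REV64 sequence around each SUB is a big-endian counter increment: the 16-byte counter is byte-reversed into host order, 1 is added to what is now the low 64-bit lane (v14/v16/v18, or q14 in the 32-bit paths, hold the 64-bit constant 1), and the bytes are reversed back. Roughly, in C (sketch; the vector add only carries within the low 64 bits, which suffices for CTR use):

    static void ctr_inc_be64(unsigned char ctr[16])
    {
        int i;
        for (i = 15; i >= 8; i--) {  /* low 8 bytes of the big-endian counter */
            if (++ctr[i] != 0)
                break;               /* stop once a byte doesn't wrap */
        }
    }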
@ -1043,8 +1202,106 @@ int wc_InitAes_h(Aes* aes, void* h)
"LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n"
"LD1 {v17.2d}, %[reg] \n"
/* double block */
"AESCTR256Block2: \n"
"CMP w11, #1 \n"
"BEQ AESCTR256Block \n"
"CMP w11, #0 \n"
"BEQ AESCTR256end \n"
"MOV v0.16b, v17.16b \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
"REV64 v17.16b, v17.16b \n" /* network order */
"AESE v0.16b, v2.16b \n"
"AESMC v0.16b, v0.16b \n"
"EXT v19.16b, v17.16b, v18.16b, #8 \n"
"SUB w11, w11, #2 \n"
"ADD v19.2d, v17.2d, v18.2d \n" /* add 1 to counter */
"ADD v17.2d, v19.2d, v18.2d \n" /* add 1 to counter */
"AESE v0.16b, v3.16b \n"
"AESMC v0.16b, v0.16b \n"
"EXT v19.16b, v19.16b, v19.16b, #8 \n"
"EXT v17.16b, v17.16b, v17.16b, #8 \n"
"AESE v0.16b, v4.16b \n"
"AESMC v0.16b, v0.16b \n"
"REV64 v19.16b, v19.16b \n" /* revert from network order */
"REV64 v17.16b, v17.16b \n" /* revert from network order */
"AESE v0.16b, v5.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v19.16b, v1.16b \n"
"AESMC v19.16b, v19.16b \n"
"AESE v0.16b, v6.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v19.16b, v2.16b \n"
"AESMC v19.16b, v19.16b \n"
"AESE v0.16b, v7.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v19.16b, v3.16b \n"
"AESMC v19.16b, v19.16b \n"
"AESE v0.16b, v8.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v19.16b, v4.16b \n"
"AESMC v19.16b, v19.16b \n"
"AESE v0.16b, v9.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v19.16b, v5.16b \n"
"AESMC v19.16b, v19.16b \n"
"AESE v0.16b, v10.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v19.16b, v6.16b \n"
"AESMC v19.16b, v19.16b \n"
"AESE v0.16b, v11.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v19.16b, v7.16b \n"
"AESMC v19.16b, v19.16b \n"
"AESE v0.16b, v12.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v19.16b, v8.16b \n"
"AESMC v19.16b, v19.16b \n"
"AESE v0.16b, v13.16b \n"
"AESMC v0.16b, v0.16b \n"
"AESE v19.16b, v9.16b \n"
"AESMC v19.16b, v19.16b \n"
"AESE v0.16b, v14.16b \n"
"AESE v19.16b, v10.16b \n"
"AESMC v19.16b, v19.16b \n"
"EOR v0.16b, v0.16b, v15.16b \n"
"AESE v19.16b, v11.16b \n"
"AESMC v19.16b, v19.16b \n"
"LD1 {v16.2d}, [%[input]], #16 \n"
"AESE v19.16b, v12.16b \n"
"AESMC v19.16b, v19.16b \n"
"EOR v0.16b, v0.16b, v16.16b \n"
"AESE v19.16b, v13.16b \n"
"AESMC v19.16b, v19.16b \n"
"LD1 {v16.2d}, [%[input]], #16 \n"
"AESE v19.16b, v14.16b \n"
"ST1 {v0.2d}, [%[out]], #16 \n"
"EOR v19.16b, v19.16b, v15.16b \n"
"EOR v19.16b, v19.16b, v16.16b \n"
"ST1 {v19.2d}, [%[out]], #16 \n"
"B AESCTR256Block2 \n"
"AESCTR256Block: \n"
"LD1 {v16.2d}, [%[input]], #16 \n"
"MOV v0.16b, v17.16b \n"
"AESE v0.16b, v1.16b \n"
"AESMC v0.16b, v0.16b \n"
@ -1057,7 +1314,6 @@ int wc_InitAes_h(Aes* aes, void* h)
"ADD v17.2d, v17.2d, v18.2d \n" /* add 1 to counter */
"AESE v0.16b, v4.16b \n"
"AESMC v0.16b, v0.16b \n"
"SUB w11, w11, #1 \n"
"AESE v0.16b, v5.16b \n"
"AESMC v0.16b, v0.16b \n"
"EXT v17.16b, v17.16b, v17.16b, #8 \n"
@ -1084,10 +1340,6 @@ int wc_InitAes_h(Aes* aes, void* h)
"EOR v0.16b, v0.16b, v16.16b \n"
"ST1 {v0.2d}, [%[out]], #16 \n"
"CBZ w11, AESCTR256end \n"
"LD1 {v16.2d}, [%[input]], #16 \n"
"B AESCTR256Block \n"
"AESCTR256end: \n"
"#store current counter value at the end \n"
"ST1 {v17.2d}, %[regOut] \n"
@ -1456,7 +1708,7 @@ static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
,[inX] "4" (xPt), [inY] "m" (aes->H)
: "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x12"
,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24"
);
}
@ -1546,9 +1798,13 @@ static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
);
if (authTagSz > AES_BLOCK_SIZE) {
XMEMCPY(authTag, scratch, AES_BLOCK_SIZE);
}
else {
/* authTagSz can be smaller than AES_BLOCK_SIZE */
XMEMCPY(authTag, scratch, authTagSz);
}
return 0;
}
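With the up-front authTagSz rejection gone (see the wc_AesGcmEncrypt/wc_AesGcmDecrypt hunks below), the tag copy is clamped instead: a request longer than the 16-byte scratch block yields the full block, and a shorter request yields a truncated tag, as GCM permits. Equivalent sketch:

    /* copy min(authTagSz, AES_BLOCK_SIZE) bytes of the computed tag */
    XMEMCPY(authTag, scratch,
            authTagSz > AES_BLOCK_SIZE ? AES_BLOCK_SIZE : authTagSz);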
@ -1777,7 +2033,7 @@ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
,[inX] "4" (xPt), [inY] "m" (aes->H)
: "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x12"
,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24"
);
}
@ -1872,8 +2128,13 @@ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
);
if (authTagSz > AES_BLOCK_SIZE) {
XMEMCPY(authTag, scratch, AES_BLOCK_SIZE);
}
else {
/* authTagSz can be smaller than AES_BLOCK_SIZE */
XMEMCPY(authTag, scratch, authTagSz);
}
return 0;
}
@ -2111,7 +2372,7 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
,[inX] "4" (xPt), [inY] "m" (aes->H)
: "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x12"
,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24"
);
}
@ -2210,8 +2471,13 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
);
if (authTagSz > AES_BLOCK_SIZE) {
XMEMCPY(authTag, scratch, AES_BLOCK_SIZE);
}
else {
/* authTagSz can be smaller than AES_BLOCK_SIZE */
XMEMCPY(authTag, scratch, authTagSz);
}
return 0;
}
@ -2244,11 +2510,6 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
const byte* authIn, word32 authInSz)
{
/* sanity checks */
if (authTagSz > AES_BLOCK_SIZE) {
WOLFSSL_MSG("parameter authTagSz can not be larger than 16 bytes");
return BAD_FUNC_ARG; /* is bigger then scratch buffer */
}
if (aes == NULL || (iv == NULL && ivSz > 0) ||
(authTag == NULL && authTagSz > 0) ||
(authIn == NULL && authInSz > 0) ||
@ -2309,11 +2570,6 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
ctr = counter ;
/* sanity checks */
if (authTagSz > AES_BLOCK_SIZE) {
WOLFSSL_MSG("parameter authTagSz can not be larger than 16 bytes");
return BAD_FUNC_ARG; /* is bigger then scratch buffer */
}
if (aes == NULL || (iv == NULL && ivSz > 0) ||
(authTag == NULL && authTagSz > 0) ||
(authIn == NULL && authInSz > 0) ||
@ -3246,27 +3502,93 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
case 10: /* AES 128 BLOCK */
__asm__ __volatile__ (
"MOV r11, %r[blocks] \n"
"VLD1.32 {q1}, [%[Key]]! \n"
"VLD1.32 {q2}, [%[Key]]! \n"
"VLD1.32 {q3}, [%[Key]]! \n"
"VLD1.32 {q4}, [%[Key]]! \n"
"VLDM %[Key]!, {q1-q4} \n"
"#Create vector with the value 1 \n"
"VMOV.u32 q15, #1 \n"
"VSHR.u64 q15, q15, #32 \n"
"VLD1.32 {q5}, [%[Key]]! \n"
"VLD1.32 {q6}, [%[Key]]! \n"
"VLD1.32 {q7}, [%[Key]]! \n"
"VLD1.32 {q8}, [%[Key]]! \n"
"VLDM %[Key]!, {q5-q8} \n"
"VEOR.32 q14, q14, q14 \n"
"VLDM %[Key]!, {q9-q11} \n"
"VEXT.8 q14, q15, q14, #8\n"
"VLD1.32 {q9}, [%[Key]]! \n"
"VLD1.32 {q10}, [%[Key]]! \n"
"VLD1.32 {q11}, [%[Key]]! \n"
"VLD1.32 {q13}, [%[reg]]\n"
/* double block */
"AESCTR128Block2: \n"
"CMP r11, #1 \n"
"BEQ AESCTR128Block \n"
"CMP r11, #0 \n"
"BEQ AESCTRend \n"
"VMOV.32 q0, q13 \n"
"AESE.8 q0, q1\n"
"AESMC.8 q0, q0\n"
"VREV64.8 q13, q13 \n" /* network order */
"AESE.8 q0, q2\n"
"AESMC.8 q0, q0\n"
"VEXT.8 q13, q13, q13, #8 \n"
"SUB r11, r11, #2 \n"
"VADD.i32 q15, q13, q14 \n" /* add 1 to counter */
"VADD.i32 q13, q15, q14 \n" /* add 1 to counter */
"AESE.8 q0, q3\n"
"AESMC.8 q0, q0\n"
"VEXT.8 q15, q15, q15, #8 \n"
"VEXT.8 q13, q13, q13, #8 \n"
"AESE.8 q0, q4\n"
"AESMC.8 q0, q0\n"
"VREV64.8 q15, q15\n" /* revert from network order */
"VREV64.8 q13, q13\n" /* revert from network order */
"AESE.8 q0, q5\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q1\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q6\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q2\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q7\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q3\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q8\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q4\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q9\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q5\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q10\n"
"AESE.8 q15, q6\n"
"AESMC.8 q15, q15\n"
"VEOR.32 q0, q0, q11\n"
"AESE.8 q15, q7\n"
"AESMC.8 q15, q15\n"
"VLD1.32 {q12}, [%[input]]! \n"
"AESE.8 q15, q8\n"
"AESMC.8 q15, q15\n"
"VEOR.32 q0, q0, q12\n"
"AESE.8 q15, q9\n"
"AESMC.8 q15, q15\n"
"VLD1.32 {q12}, [%[input]]! \n"
"AESE.8 q15, q10\n"
"VST1.32 {q0}, [%[out]]! \n"
"VEOR.32 q15, q15, q11\n"
"VEOR.32 q15, q15, q12\n"
"VST1.32 {q15}, [%[out]]! \n"
"B AESCTR128Block2 \n"
/* single block */
"AESCTR128Block: \n"
"VMOV.32 q0, q13 \n"
"AESE.8 q0, q1\n"
@ -3299,11 +3621,6 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
"VEOR.32 q0, q0, q12\n"
"VST1.32 {q0}, [%[out]]! \n"
"CMP r11, #0 \n"
"BEQ AESCTRend \n"
"VLD1.32 {q12}, [%[input]]! \n"
"B AESCTR128Block \n"
"AESCTRend: \n"
"#store current counter qalue at the end \n"
"VST1.32 {q13}, [%[regOut]] \n"
@ -3313,7 +3630,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
:"0" (out), [Key] "1" (keyPt), [input] "3" (in),
[blocks] "r" (numBlocks), [reg] "2" (regPt)
: "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5",
"q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14"
"q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14", "q15"
);
break;
@ -3339,8 +3656,99 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
"VLD1.32 {q10}, [%[Key]]!\n"
"VLD1.32 {q13}, [%[reg]]\n"
/* double block */
"AESCTR192Block2: \n"
"CMP r11, #1 \n"
"BEQ AESCTR192Block \n"
"CMP r11, #0 \n"
"BEQ AESCTR192end \n"
"VMOV.32 q0, q13\n"
"AESE.8 q0, q1\n"
"AESMC.8 q0, q0\n"
"VREV64.8 q13, q13 \n" /* network order */
"AESE.8 q0, q2\n"
"AESMC.8 q0, q0\n"
"VEXT.8 q13, q13, q13, #8 \n"
"SUB r11, r11, #2 \n"
"VADD.i32 q15, q13, q14 \n" /* add 1 to counter */
"VADD.i32 q13, q15, q14 \n" /* add 1 to counter */
"AESE.8 q0, q3\n"
"AESMC.8 q0, q0\n"
"VEXT.8 q15, q15, q15, #8 \n"
"VEXT.8 q13, q13, q13, #8 \n"
"AESE.8 q0, q4\n"
"AESMC.8 q0, q0\n"
"VREV64.8 q15, q15\n" /* revert from network order */
"VREV64.8 q13, q13\n" /* revert from network order */
"AESE.8 q0, q5\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q1\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q6\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q2\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q7\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q3\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q8\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q4\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q9\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q5\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q10\n"
"AESMC.8 q0, q0\n"
"VLD1.32 {q11}, [%[Key]]! \n"
"AESE.8 q15, q6\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q11\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q7\n"
"AESMC.8 q15, q15\n"
"AESE.8 q15, q8\n"
"AESMC.8 q15, q15\n"
"AESE.8 q15, q9\n"
"AESMC.8 q15, q15\n"
"VLD1.32 {q12}, [%[input]]! \n"
"AESE.8 q15, q10\n"
"AESMC.8 q15, q15\n"
"AESE.8 q15, q11\n"
"AESMC.8 q15, q15\n"
"VLD1.32 {q11}, [%[Key]]! \n"
"AESE.8 q0, q11\n"
"AESE.8 q15, q11\n"
"VLD1.32 {q11}, [%[Key]] \n"
"VEOR.32 q0, q0, q11\n"
"VEOR.32 q15, q15, q11\n"
"VEOR.32 q0, q0, q12\n"
"VLD1.32 {q12}, [%[input]]! \n"
"VST1.32 {q0}, [%[out]]! \n"
"VEOR.32 q15, q15, q12\n"
"VST1.32 {q15}, [%[out]]! \n"
"SUB %[Key], %[Key], #32 \n"
"B AESCTR192Block2 \n"
/* single block */
"AESCTR192Block: \n"
"VLD1.32 {q12}, [%[input]]! \n"
"VLD1.32 {q11}, [%[Key]]! \n"
"VMOV.32 q0, q13 \n"
"AESE.8 q0, q1\n"
@ -3375,16 +3783,10 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
"AESE.8 q0, q11\n"
"VLD1.32 {q11}, [%[Key]] \n"
"VEOR.32 q0, q0, q11\n"
"SUB %[Key], %[Key], #32 \n"
"#CTR operations, increment counter and xorbuf \n"
"VEOR.32 q0, q0, q12\n"
"VST1.32 {q0}, [%[out]]! \n"
"CMP r11, #0 \n"
"BEQ AESCTR192end \n"
"VLD1.32 {q12}, [%[input]]! \n"
"B AESCTR192Block \n"
"AESCTR192end: \n"
"#store current counter qalue at the end \n"
"VST1.32 {q13}, [%[regOut]] \n"
@ -3420,8 +3822,111 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
"VLD1.32 {q10}, [%[Key]]! \n"
"VLD1.32 {q13}, [%[reg]]\n"
/* double block */
"AESCTR256Block2: \n"
"CMP r11, #1 \n"
"BEQ AESCTR256Block \n"
"CMP r11, #0 \n"
"BEQ AESCTR256end \n"
"VMOV.32 q0, q13 \n"
"AESE.8 q0, q1\n"
"AESMC.8 q0, q0\n"
"VREV64.8 q13, q13 \n" /* network order */
"AESE.8 q0, q2\n"
"AESMC.8 q0, q0\n"
"VEXT.8 q13, q13, q13, #8 \n"
"SUB r11, r11, #2 \n"
"VADD.i32 q15, q13, q14 \n" /* add 1 to counter */
"VADD.i32 q13, q15, q14 \n" /* add 1 to counter */
"AESE.8 q0, q3\n"
"AESMC.8 q0, q0\n"
"VEXT.8 q15, q15, q15, #8 \n"
"VEXT.8 q13, q13, q13, #8 \n"
"AESE.8 q0, q4\n"
"AESMC.8 q0, q0\n"
"VREV64.8 q15, q15\n" /* revert from network order */
"VREV64.8 q13, q13\n" /* revert from network order */
"AESE.8 q0, q5\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q1\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q6\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q2\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q7\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q3\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q8\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q4\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q9\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q5\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q10\n"
"AESMC.8 q0, q0\n"
"VLD1.32 {q11}, [%[Key]]! \n"
"AESE.8 q15, q6\n"
"AESMC.8 q15, q15\n"
"AESE.8 q0, q11\n"
"AESMC.8 q0, q0\n"
"AESE.8 q15, q7\n"
"AESMC.8 q15, q15\n"
"AESE.8 q15, q8\n"
"AESMC.8 q15, q15\n"
"AESE.8 q15, q9\n"
"AESMC.8 q15, q15\n"
"VLD1.32 {q12}, [%[input]]! \n"
"AESE.8 q15, q10\n"
"AESMC.8 q15, q15\n"
"AESE.8 q15, q11\n"
"AESMC.8 q15, q15\n"
"VLD1.32 {q11}, [%[Key]]! \n"
"AESE.8 q0, q11\n" /* rnd 12*/
"AESMC.8 q0, q0\n"
"AESE.8 q15, q11\n" /* rnd 12 */
"AESMC.8 q15, q15\n"
"VLD1.32 {q11}, [%[Key]]! \n"
"AESE.8 q0, q11\n" /* rnd 13 */
"AESMC.8 q0, q0\n"
"AESE.8 q15, q11\n" /* rnd 13 */
"AESMC.8 q15, q15\n"
"VLD1.32 {q11}, [%[Key]]! \n"
"AESE.8 q0, q11\n" /* rnd 14 */
"AESE.8 q15, q11\n" /* rnd 14 */
"VLD1.32 {q11}, [%[Key]] \n"
"VEOR.32 q0, q0, q11\n" /* rnd 15 */
"VEOR.32 q15, q15, q11\n" /* rnd 15 */
"VEOR.32 q0, q0, q12\n"
"VLD1.32 {q12}, [%[input]]! \n"
"VST1.32 {q0}, [%[out]]! \n"
"VEOR.32 q15, q15, q12\n"
"VST1.32 {q15}, [%[out]]! \n"
"SUB %[Key], %[Key], #64 \n"
"B AESCTR256Block2 \n"
/* single block */
"AESCTR256Block: \n"
"VLD1.32 {q12}, [%[input]]! \n"
"VLD1.32 {q11}, [%[Key]]! \n"
"VMOV.32 q0, q13 \n"
"AESE.8 q0, q1\n"
@ -3435,7 +3940,6 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
"VADD.i32 q13, q13, q14 \n" /* add 1 to counter */
"AESE.8 q0, q4\n"
"AESMC.8 q0, q0\n"
"SUB r11, r11, #1 \n"
"AESE.8 q0, q5\n"
"AESMC.8 q0, q0\n"
"VEXT.8 q13, q13, q13, #8 \n"
@ -3453,25 +3957,19 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
"AESE.8 q0, q11\n"
"AESMC.8 q0, q0\n"
"VLD1.32 {q11}, [%[Key]]! \n"
"AESE.8 q0, q11\n"
"AESE.8 q0, q11\n" /* rnd 12 */
"AESMC.8 q0, q0\n"
"VLD1.32 {q11}, [%[Key]]! \n"
"AESE.8 q0, q11\n"
"AESE.8 q0, q11\n" /* rnd 13 */
"AESMC.8 q0, q0\n"
"VLD1.32 {q11}, [%[Key]]! \n"
"AESE.8 q0, q11\n"
"AESE.8 q0, q11\n" /* rnd 14 */
"VLD1.32 {q11}, [%[Key]] \n"
"VEOR.32 q0, q0, q11\n"
"SUB %[Key], %[Key], #64 \n"
"VEOR.32 q0, q0, q11\n" /* rnd 15 */
"#CTR operations, increment counter and xorbuf \n"
"VEOR.32 q0, q0, q12\n"
"VST1.32 {q0}, [%[out]]! \n"
"CMP r11, #0 \n"
"BEQ AESCTR256end \n"
"VLD1.32 {q12}, [%[input]]! \n"
"B AESCTR256Block \n"
"AESCTR256end: \n"
"#store current counter qalue at the end \n"
"VST1.32 {q13}, [%[regOut]] \n"
@ -3511,48 +4009,66 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
#endif /* WOLFSSL_AES_COUNTER */
#ifdef HAVE_AESGCM
/*
* from GCM implementation in wolfcrypt/src/aes.c
* Uses Karatsuba algorithm. Reduction algorithm is based on "Implementing GCM
* on ARMv8". Shifting left to account for bit reflection is based on
* "Carry-Less Multiplication and Its Usage for Computing the GCM mode"
*/
static INLINE void RIGHTSHIFTX(byte* x)
{
int i;
int carryOut = 0;
int carryIn = 0;
int borrow = x[15] & 0x01;
for (i = 0; i < AES_BLOCK_SIZE; i++) {
carryOut = x[i] & 0x01;
x[i] = (x[i] >> 1) | (carryIn ? 0x80 : 0);
carryIn = carryOut;
}
if (borrow) x[0] ^= 0xE1;
}
static void GMULT(byte* X, byte* Y)
{
byte Z[AES_BLOCK_SIZE];
byte V[AES_BLOCK_SIZE];
int i, j;
__asm__ __volatile__ (
"VLD1.32 {q0}, [%[x]] \n"
XMEMSET(Z, 0, AES_BLOCK_SIZE);
XMEMCPY(V, X, AES_BLOCK_SIZE);
for (i = 0; i < AES_BLOCK_SIZE; i++)
{
byte y = Y[i];
for (j = 0; j < 8; j++)
{
if (y & 0x80) {
xorbuf(Z, V, AES_BLOCK_SIZE);
}
/* In GCM format bits are big endian, switch location of bytes to
* allow for logical shifts and carries.
*/
"VREV64.8 q0, q0 \n"
"VLD1.32 {q1}, [%[y]] \n" /* converted on set key */
"VSWP.8 d0, d1 \n"
RIGHTSHIFTX(V);
y = y << 1;
}
}
XMEMCPY(X, Z, AES_BLOCK_SIZE);
"VMULL.p64 q5, d0, d2 \n"
"VMULL.p64 q6, d1, d3 \n"
"VEOR d15, d2, d3 \n"
"VEOR d14, d0, d1 \n"
"VMULL.p64 q7, d15, d14 \n"
"VEOR q7, q5 \n"
"VEOR q7, q6 \n"
"VEOR d11, d14 \n"
"VEOR d12, d15\n"
/* shift to left by 1 to account for reflection */
"VMOV q7, q6 \n"
"VSHL.u64 q6, q6, #1 \n"
"VSHR.u64 q7, q7, #63 \n"
"VEOR d13, d14 \n"
"VMOV q8, q5 \n"
"VSHL.u64 q5, q5, #1 \n"
"VSHR.u64 q8, q8, #63 \n"
"VEOR d12, d17 \n"
"VEOR d11, d16 \n"
/* create constant 0xc200000000000000 */
"VMOV.i32 d16, 0xc2000000 \n"
"VSHL.u64 d16, d16, #32 \n"
/* reduce product of multiplication */
"VMULL.p64 q9, d10, d16 \n"
"VEOR d11, d18 \n"
"VEOR d12, d19 \n"
"VMULL.p64 q9, d11, d16 \n"
"VEOR q6, q9 \n"
"VEOR q10, q5, q6 \n"
/* convert to GCM format */
"VREV64.8 q10, q10 \n"
"VSWP.8 d20, d21 \n"
"VST1.32 {q10}, [%[xOut]] \n"
: [xOut] "=r" (X), [yOut] "=r" (Y)
: [x] "0" (X), [y] "1" (Y)
:
);
}
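The three VMULL.P64 multiplies above are the Karatsuba step named in the comment. Splitting each 128-bit operand into 64-bit halves, with coefficients in GF(2) so that addition is XOR:

    X = X_1 z^{64} \oplus X_0, \qquad Y = Y_1 z^{64} \oplus Y_0
    X \cdot Y = X_1 Y_1\, z^{128} \oplus \bigl[(X_0 \oplus X_1)(Y_0 \oplus Y_1) \oplus X_1 Y_1 \oplus X_0 Y_0\bigr] z^{64} \oplus X_0 Y_0

Only three carry-less multiplies are needed instead of four (in the asm: q5 = X_0 Y_0, q6 = X_1 Y_1, q7 = the middle product). The shift left by one accounts for GCM's bit-reflected representation, and the two multiplies by the constant 0xc200000000000000 fold the result back modulo the GHASH polynomial z^{128} + z^7 + z^2 + z + 1, following the papers cited in the comment.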
@ -3639,6 +4155,16 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
byte scratch[AES_BLOCK_SIZE];
ctr = counter ;
/* sanity checks */
if (aes == NULL || (iv == NULL && ivSz > 0) ||
(authTag == NULL && authTagSz > 0) ||
(authIn == NULL && authInSz > 0) ||
(in == NULL && sz > 0) ||
(out == NULL && authTag == NULL)) {
WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0");
return BAD_FUNC_ARG;
}
XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
if (ivSz == NONCE_SZ) {
XMEMCPY(initialCounter, iv, ivSz);
@ -3668,7 +4194,12 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz);
wc_AesEncrypt(aes, initialCounter, scratch);
if (authTagSz > AES_BLOCK_SIZE) {
xorbuf(authTag, scratch, AES_BLOCK_SIZE);
}
else {
xorbuf(authTag, scratch, authTagSz);
}
return 0;
}
@ -4059,6 +4590,19 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
: "cc", "memory"
);
}
#else
{
word32* pt = (word32*)aes->H;
__asm__ volatile (
"VLD1.32 {q0}, [%[h]] \n"
"VREV64.8 q0, q0 \n"
"VSWP.8 d0, d1 \n"
"VST1.32 {q0}, [%[out]] \n"
: [out] "=r" (pt)
: [h] "0" (pt)
: "cc", "memory"
);
}
#endif
}
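The new #else branch (presumably the 32-bit build path) pre-reverses aes->H once at key-set time, which is what GMULT's "converted on set key" comment relies on. VREV64.8 reverses the bytes within each doubleword and VSWP swaps the two doublewords, which together is a full 16-byte reversal; in C terms (sketch):

    /* full byte reversal of the 16-byte hash key H */
    static void swap_block(unsigned char h[16])
    {
        int i;
        for (i = 0; i < 8; i++) {
            unsigned char t = h[i];
            h[i]      = h[15 - i];
            h[15 - i] = t;
        }
    }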


@ -151,135 +151,133 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
/* beginning of SHA256 block operation */
"sha256Start:\n"
/* Round 1 */
"MOV v4.16b, v0.16b \n"
"ADD v0.4s, v0.4s, v16.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
/* Round 2 */
"SHA256SU0 v4.4s, v1.4s \n"
"ADD v0.4s, v1.4s, v17.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
/* Round 3 */
"SHA256SU0 v1.4s, v2.4s \n"
"ADD v0.4s, v2.4s, v18.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256SU1 v1.4s, v3.4s, v4.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v4.4s, v1.4s \n"
/* Round 4 */
"SHA256SU0 v2.4s, v3.4s \n"
"ADD v0.4s, v3.4s, v19.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v5.16b, v1.16b \n"
"SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"SHA256SU1 v2.4s, v4.4s, v1.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v5.4s, v2.4s \n"
/* Round 5 */
"SHA256SU0 v3.4s, v4.4s \n"
"ADD v0.4s, v4.4s, v20.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v6.16b, v2.16b \n"
"SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"SHA256SU1 v3.4s, v1.4s, v2.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v6.4s, v3.4s \n"
"ADD v0.4s, v5.4s, v21.4s \n"
/* Round 6 */
"SHA256SU0 v4.4s, v1.4s \n"
"ADD v0.4s, v1.4s, v21.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v7.16b, v3.16b \n"
"SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v7.4s, v4.4s \n"
"ADD v0.4s, v6.4s, v22.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v8.16b, v4.16b \n"
"SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v8.4s, v5.4s \n"
"ADD v0.4s, v7.4s, v23.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v9.16b, v5.16b \n"
"SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v9.4s, v6.4s \n"
"ADD v0.4s, v8.4s, v24.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v10.16b, v6.16b \n"
"SHA256SU1 v9.4s, v7.4s, v8.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v10.4s, v7.4s \n"
"ADD v0.4s, v9.4s, v25.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256SU1 v10.4s, v8.4s, v9.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"ADD v0.4s, v10.4s, v26.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
/* Re-use of registers is needed in order to not overwrite
* previous digest value. */
"#move to lower register and handle last rounds 11-15 \n"
"MOV v4.16b, v7.16b \n"
"MOV v1.16b, v8.16b \n"
"MOV v2.16b, v9.16b \n"
"MOV v3.16b, v10.16b \n"
"MOV v5.16b, v8.16b \n"
"SHA256SU0 v4.4s, v1.4s \n" /* 4 -> 11 */
"SHA256SU0 v5.4s, v2.4s \n"
"SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"ADD v0.4s, v4.4s, v27.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v6.16b, v2.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v6.4s, v3.4s \n"
"ADD v0.4s, v5.4s, v28.4s \n"
/* Round 7 */
"SHA256SU0 v1.4s, v2.4s \n"
"ADD v0.4s, v2.4s, v22.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v7.16b, v3.16b \n"
"SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"SHA256SU1 v1.4s, v3.4s, v4.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v7.4s, v4.4s \n"
"ADD v0.4s, v6.4s, v29.4s \n"
/* Round 8 */
"SHA256SU0 v2.4s, v3.4s \n"
"ADD v0.4s, v3.4s, v23.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v8.16b, v4.16b \n"
"SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"SHA256SU1 v2.4s, v4.4s, v1.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v8.4s, v5.4s \n"
"ADD v0.4s, v7.4s, v30.4s \n"
/* Round 9 */
"SHA256SU0 v3.4s, v4.4s \n"
"ADD v0.4s, v4.4s, v24.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"SHA256SU1 v3.4s, v1.4s, v2.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"ADD v0.4s, v8.4s, v31.4s \n"
/* Round 10 */
"SHA256SU0 v4.4s, v1.4s \n"
"ADD v0.4s, v1.4s, v25.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
/* Round 11 */
"SHA256SU0 v1.4s, v2.4s \n"
"ADD v0.4s, v2.4s, v26.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256SU1 v1.4s, v3.4s, v4.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
/* Round 12 */
"SHA256SU0 v2.4s, v3.4s \n"
"ADD v0.4s, v3.4s, v27.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256SU1 v2.4s, v4.4s, v1.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
/* Round 13 */
"SHA256SU0 v3.4s, v4.4s \n"
"ADD v0.4s, v4.4s, v28.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256SU1 v3.4s, v1.4s, v2.4s \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
/* Round 14 */
"ADD v0.4s, v1.4s, v29.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
/* Round 15 */
"ADD v0.4s, v2.4s, v30.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
/* Round 16 */
"ADD v0.4s, v3.4s, v31.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"#Add working vars back into digest state \n"
"SUB w8, w8, #1 \n"
"ADD v12.4s, v12.4s, v14.4s \n"
"ADD v13.4s, v13.4s, v15.4s \n"
"#check if more blocks should be done\n"
"SUB w8, w8, #1 \n"
"CBZ w8, sha256End \n"
"#load in message and schedual updates \n"
@ -301,7 +299,7 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
[blocks] "2" (numBlocks), [dataIn] "3" (data)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15"
"v15", "w8"
);
AddLength(sha256, SHA256_BLOCK_SIZE * numBlocks);
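Each numbered "Round" group above follows the same four-instruction pattern: save the a-d half of the state (SHA256H2 needs the pre-round value, hence the MOV v11, v12), add the round constants into a schedule vector, run SHA256H/SHA256H2, and extend the message schedule with SHA256SU0/SHA256SU1. One such 4-round step with ACLE intrinsics, as an illustration of the pattern rather than the committed code:

    #include <arm_neon.h>

    /* abcd/efgh are the working state (q12/q13 in the asm), w0..w3 the
     * current schedule vectors, k the four round constants for this step */
    static void sha256_step(uint32x4_t *abcd, uint32x4_t *efgh,
                            uint32x4_t *w0, uint32x4_t w1,
                            uint32x4_t w2, uint32x4_t w3,
                            const uint32_t k[4])
    {
        uint32x4_t wk   = vaddq_u32(*w0, vld1q_u32(k)); /* ADD v0, w, K */
        uint32x4_t save = *abcd;                        /* MOV v11, v12 */

        *abcd = vsha256hq_u32(*abcd, *efgh, wk);        /* SHA256H      */
        *efgh = vsha256h2q_u32(*efgh, save, wk);        /* SHA256H2     */

        /* extend the message schedule: SHA256SU0 + SHA256SU1 */
        *w0 = vsha256su1q_u32(vsha256su0q_u32(*w0, w1), w2, w3);
    }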
@ -691,20 +689,17 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
add = (len + sha256->buffLen) - numBlocks * SHA256_BLOCK_SIZE;
__asm__ volatile (
"#load leftover data\n"
"VLD1.32 {q0}, [%[buffer]]! \n"
"VLD1.32 {q1}, [%[buffer]]! \n"
"VLD1.32 {q2}, [%[buffer]]! \n"
"VLD1.32 {q3}, [%[buffer]] \n"
"VLDM %[buffer]!, {q0-q3} \n"
"#load current digest\n"
"VLD1.32 {q12}, [%[digest]]! \n"
"VLD1.32 {q13}, [%[digest]] \n"
"SUB %[digest], %[digest], #16 \n"
"VLDM %[digest], {q12-q13} \n"
"MOV r8, %r[blocks] \n"
"VREV32.8 q0, q0 \n"
"VREV32.8 q1, q1 \n"
"VREV32.8 q2, q2 \n"
"VREV32.8 q3, q3 \n"
"VLDM %[k]! ,{q5-q8} \n"
"VLDM %[k]! ,{q9}\n"
"VMOV.32 q14, q12 \n" /* store digest for add at the end */
"VMOV.32 q15, q13 \n"
@ -713,7 +708,6 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
"sha256Start:\n"
/* Round 1 */
"VLD1.32 {q5}, [%[k]]! \n"
"VMOV.32 q4, q0 \n"
"VADD.i32 q0, q0, q5 \n"
"VMOV.32 q11, q12 \n"
@ -721,142 +715,127 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
"SHA256H2.32 q13, q11, q0 \n"
/* Round 2 */
"VLD1.32 {q5}, [%[k]]! \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 3 */
"VLD1.32 {q5}, [%[k]]! \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 4 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"VADD.i32 q0, q3, q5 \n"
"VADD.i32 q0, q1, q6 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q6, q1 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 3 */
"SHA256SU0.32 q1, q2 \n"
"VADD.i32 q0, q2, q7 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 4 */
"SHA256SU0.32 q2, q3 \n"
"VADD.i32 q0, q3, q8 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q2, q4, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 5 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q6, q2 \n"
"VADD.i32 q0, q4, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q7, q2 \n"
"SHA256SU1.32 q6, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 6 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q7, q3 \n"
"VADD.i32 q0, q6, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q8, q3 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q7, q4, q6 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 7 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q8, q4 \n"
"VADD.i32 q0, q7, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV q9, q4 \n"
"SHA256SU1.32 q8, q6, q7 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 8 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q9, q6 \n"
"VADD.i32 q0, q8, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q10, q6 \n"
"SHA256SU1.32 q9, q7, q8 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 9 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q10, q7 \n"
"VMOV.32 q1, q7 \n"
"VADD.i32 q0, q9, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q10, q8, q9 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 10 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q1, q8 \n"
"VMOV.32 q2, q8 \n"
"VADD.i32 q0, q10, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q9, q10 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 11 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q2, q9 \n"
"VMOV.32 q3, q9 \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q2, q10, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 12 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q3, q10 \n"
"VMOV.32 q4, q10 \n"
"VADD.i32 q0, q2, q5 \n"
"SHA256SU0.32 q3, q4 \n"
"VADD.i32 q0, q4, q9 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q3, q1, q2 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 13 */
"VLD1.32 {q5}, [%[k]]! \n"
/* Round 6 */
"VLD1.32 {q10}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"VMOV.32 q6, q1 \n"
"VADD.i32 q0, q3, q5 \n"
"VADD.i32 q0, q1, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 14 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q6, q2 \n"
"VMOV.32 q7, q2 \n"
"VADD.i32 q0, q4, q5 \n"
/* Round 7 */
"VLD1.32 {q10}, [%[k]]! \n"
"SHA256SU0.32 q1, q2 \n"
"VADD.i32 q0, q2, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 8 */
"VLD1.32 {q10}, [%[k]]! \n"
"SHA256SU0.32 q2, q3 \n"
"VADD.i32 q0, q3, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q2, q4, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 9 */
"VLD1.32 {q10}, [%[k]]! \n"
"SHA256SU0.32 q3, q4 \n"
"VADD.i32 q0, q4, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q3, q1, q2 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 10 */
"VLD1.32 {q10}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"VADD.i32 q0, q1, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 11 */
"VLD1.32 {q10}, [%[k]]! \n"
"SHA256SU0.32 q1, q2 \n"
"VADD.i32 q0, q2, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 12 */
"VLD1.32 {q10}, [%[k]]! \n"
"SHA256SU0.32 q2, q3 \n"
"VADD.i32 q0, q3, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q2, q4, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 13 */
"VLD1.32 {q10}, [%[k]]! \n"
"SHA256SU0.32 q3, q4 \n"
"VADD.i32 q0, q4, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q3, q1, q2 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 14 */
"VLD1.32 {q10}, [%[k]]! \n"
"VADD.i32 q0, q1, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q6, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 15 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q7, q3 \n"
"VADD.i32 q0, q6, q5 \n"
"VLD1.32 {q10}, [%[k]]! \n"
"VADD.i32 q0, q2, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q7, q4, q6 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 16 */
"VLD1.32 {q5}, [%[k]]! \n"
"VADD.i32 q0, q7, q5 \n"
"VLD1.32 {q10}, [%[k]] \n"
"SUB r8, r8, #1 \n"
"VADD.i32 q0, q3, q10 \n"
"VMOV.32 q11, q12 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
@ -866,7 +845,6 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
"VADD.i32 q13, q13, q15 \n"
"#check if more blocks should be done\n"
"SUB r8, r8, #1 \n"
"CMP r8, #0 \n"
"BEQ sha256End \n"
@ -877,7 +855,7 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
"VLD1.32 {q3}, [%[dataIn]]! \n"
/* reset K pointer */
"SUB %[k], %[k], #256 \n"
"SUB %[k], %[k], #160 \n"
"VREV32.8 q0, q0 \n"
"VREV32.8 q1, q1 \n"
"VREV32.8 q2, q2 \n"
@ -895,7 +873,7 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
[blocks] "2" (numBlocks), [dataIn] "3" (data)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14",
"q15"
"q15", "r8"
);
AddLength(sha256, SHA256_BLOCK_SIZE * numBlocks);
@ -930,15 +908,10 @@ int wc_Sha256Final(Sha256* sha256, byte* hash)
sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;
__asm__ volatile (
"#load leftover data\n"
"VLD1.32 {q0}, [%[buffer]]! \n"
"VLD1.32 {q1}, [%[buffer]]! \n"
"VLD1.32 {q2}, [%[buffer]]! \n"
"VLD1.32 {q3}, [%[buffer]] \n"
"VLDM %[buffer]!, {q0-q3} \n"
"#load current digest\n"
"VLD1.32 {q12}, [%[digest]]! \n"
"VLD1.32 {q13}, [%[digest]] \n"
"SUB %[digest], %[digest], #16 \n"
"VLDM %[digest], {q12-q13} \n"
"VREV32.8 q0, q0 \n"
"VREV32.8 q1, q1 \n"
"VREV32.8 q2, q2 \n"
@ -949,7 +922,6 @@ int wc_Sha256Final(Sha256* sha256, byte* hash)
"VMOV.32 q15, q13 \n"
/* beginning of SHA256 block operation */
/* Round 1 */
"VLD1.32 {q5}, [%[k]]! \n"
"VMOV.32 q4, q0 \n"
@ -960,141 +932,129 @@ int wc_Sha256Final(Sha256* sha256, byte* hash)
/* Round 2 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 3 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q1, q2 \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 4 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"SHA256SU0.32 q2, q3 \n"
"VADD.i32 q0, q3, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q6, q1 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256SU1.32 q2, q4, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 5 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q6, q2 \n"
"SHA256SU0.32 q3, q4 \n"
"VADD.i32 q0, q4, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q7, q2 \n"
"SHA256SU1.32 q6, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 6 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q7, q3 \n"
"VADD.i32 q0, q6, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q8, q3 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q7, q4, q6 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 7 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q8, q4 \n"
"VADD.i32 q0, q7, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV q9, q4 \n"
"SHA256SU1.32 q8, q6, q7 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 8 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q9, q6 \n"
"VADD.i32 q0, q8, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q10, q6 \n"
"SHA256SU1.32 q9, q7, q8 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 9 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q10, q7 \n"
"VMOV.32 q1, q7 \n"
"VADD.i32 q0, q9, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q10, q8, q9 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 10 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q1, q8 \n"
"VMOV.32 q2, q8 \n"
"VADD.i32 q0, q10, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q9, q10 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 11 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q2, q9 \n"
"VMOV.32 q3, q9 \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q2, q10, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 12 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q3, q10 \n"
"VMOV.32 q4, q10 \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q3, q1, q2 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 13 */
/* Round 6 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"VMOV.32 q6, q1 \n"
"VADD.i32 q0, q3, q5 \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 14 */
/* Round 7 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q6, q2 \n"
"VMOV.32 q7, q2 \n"
"SHA256SU0.32 q1, q2 \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 8 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q2, q3 \n"
"VADD.i32 q0, q3, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q2, q4, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 9 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q3, q4 \n"
"VADD.i32 q0, q4, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q6, q3, q4 \n"
"SHA256SU1.32 q3, q1, q2 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 10 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 11 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q1, q2 \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 12 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q2, q3 \n"
"VADD.i32 q0, q3, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q2, q4, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 13 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q3, q4 \n"
"VADD.i32 q0, q4, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q3, q1, q2 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 14 */
"VLD1.32 {q5}, [%[k]]! \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 15 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q7, q3 \n"
"VADD.i32 q0, q6, q5 \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q7, q4, q6 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 16 */
"VLD1.32 {q5}, [%[k]]! \n"
"VADD.i32 q0, q7, q5 \n"
"VADD.i32 q0, q3, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
@ -1153,21 +1113,15 @@ int wc_Sha256Final(Sha256* sha256, byte* hash)
word32* digPt = sha256->digest;
__asm__ volatile (
"#load leftover data\n"
"VLD1.32 {q0}, [%[buffer]]! \n"
"VLD1.32 {q1}, [%[buffer]]! \n"
"VLD1.32 {q2}, [%[buffer]]! \n"
"VLD1.32 {q3}, [%[buffer]] \n"
"VLDM %[buffer]!, {q0-q3} \n"
"#load current digest\n"
"VLD1.32 {q12}, [%[digest]]! \n"
"VLD1.32 {q13}, [%[digest]] \n"
"SUB %[digest], %[digest], #16 \n"
"VLDM %[digest], {q12-q13} \n"
"VMOV.32 q14, q12 \n" /* store digest for add at the end */
"VMOV.32 q15, q13 \n"
/* beginning of SHA256 block operation */
/* Round 1 */
"VLD1.32 {q5}, [%[k]]! \n"
"VMOV.32 q4, q0 \n"
@ -1178,141 +1132,129 @@ int wc_Sha256Final(Sha256* sha256, byte* hash)
/* Round 2 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 3 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q1, q2 \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 4 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"SHA256SU0.32 q2, q3 \n"
"VADD.i32 q0, q3, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q6, q1 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256SU1.32 q2, q4, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 5 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q6, q2 \n"
"SHA256SU0.32 q3, q4 \n"
"VADD.i32 q0, q4, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q7, q2 \n"
"SHA256SU1.32 q6, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 6 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q7, q3 \n"
"VADD.i32 q0, q6, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q8, q3 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q7, q4, q6 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 7 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q8, q4 \n"
"VADD.i32 q0, q7, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV q9, q4 \n"
"SHA256SU1.32 q8, q6, q7 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 8 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q9, q6 \n"
"VADD.i32 q0, q8, q5 \n"
"VMOV.32 q11, q12 \n"
"VMOV.32 q10, q6 \n"
"SHA256SU1.32 q9, q7, q8 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 9 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q10, q7 \n"
"VMOV.32 q1, q7 \n"
"VADD.i32 q0, q9, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q10, q8, q9 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 10 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q1, q8 \n"
"VMOV.32 q2, q8 \n"
"VADD.i32 q0, q10, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q9, q10 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 11 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q2, q9 \n"
"VMOV.32 q3, q9 \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q2, q10, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 12 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q3, q10 \n"
"VMOV.32 q4, q10 \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q3, q1, q2 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 13 */
/* Round 6 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"VMOV.32 q6, q1 \n"
"VADD.i32 q0, q3, q5 \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 14 */
/* Round 7 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q6, q2 \n"
"VMOV.32 q7, q2 \n"
"SHA256SU0.32 q1, q2 \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 8 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q2, q3 \n"
"VADD.i32 q0, q3, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q2, q4, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 9 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q3, q4 \n"
"VADD.i32 q0, q4, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q6, q3, q4 \n"
"SHA256SU1.32 q3, q1, q2 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 10 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q4, q1 \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q4, q2, q3 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 11 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q1, q2 \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q1, q3, q4 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 12 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q2, q3 \n"
"VADD.i32 q0, q3, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q2, q4, q1 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 13 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q3, q4 \n"
"VADD.i32 q0, q4, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q3, q1, q2 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 14 */
"VLD1.32 {q5}, [%[k]]! \n"
"VADD.i32 q0, q1, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 15 */
"VLD1.32 {q5}, [%[k]]! \n"
"SHA256SU0.32 q7, q3 \n"
"VADD.i32 q0, q6, q5 \n"
"VADD.i32 q0, q2, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256SU1.32 q7, q4, q6 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"
/* Round 16 */
"VLD1.32 {q5}, [%[k]]! \n"
"VADD.i32 q0, q7, q5 \n"
"VADD.i32 q0, q3, q5 \n"
"VMOV.32 q11, q12 \n"
"SHA256H.32 q12, q13, q0 \n"
"SHA256H2.32 q13, q11, q0 \n"