Merge pull request #543 from JacobBarthelmeh/ARMv8

ARMv8 : increase performance with SHA256
This commit is contained in:
toddouska
2016-09-09 10:23:44 -07:00
committed by GitHub

View File

@@ -108,161 +108,206 @@ static INLINE void AddLength(Sha256* sha256, word32 len)
/* ARMv8 hardware acceleration */ /* ARMv8 hardware acceleration */
int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
{ {
word32 add;
word32 numBlocks;
/* do block size increments */ if (sha256 == NULL || (data == NULL && len != 0)) {
byte* local = (byte*)sha256->buffer; return BAD_FUNC_ARG;
}
while (len) { /* fill leftover buffer with data */
word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen); add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen);
XMEMCPY(&local[sha256->buffLen], data, add); XMEMCPY((byte*)(sha256->buffer) + sha256->buffLen, data, add);
sha256->buffLen += add;
data += add;
len -= add;
sha256->buffLen += add; /* number of blocks in a row to complete */
data += add; numBlocks = (len + sha256->buffLen)/SHA256_BLOCK_SIZE;
len -= add;
if (sha256->buffLen == SHA256_BLOCK_SIZE) { if (numBlocks > 0) {
word32* Kpt = (word32*)K; /* get leftover amount after blocks */
word32* bufferPt = sha256->buffer; add = (len + sha256->buffLen) - numBlocks * SHA256_BLOCK_SIZE;
word32* digestPt = sha256->digest; __asm__ volatile (
"#load leftover data\n"
"LD1 {v0.2d-v3.2d}, %[buffer] \n"
__asm__ volatile ( "#load current digest\n"
"#load in message and schedual updates \n" "LD1 {v12.2d-v13.2d}, %[digest] \n"
"LD1 {v4.16b-v7.16b}, [%[buffer]] \n" "MOV w8, %w[blocks] \n"
"MOV v0.16b, v4.16b \n" "REV32 v0.16b, v0.16b \n"
"MOV v1.16b, v5.16b \n" "REV32 v1.16b, v1.16b \n"
"REV32 v0.16b, v0.16b \n" "REV32 v2.16b, v2.16b \n"
"MOV v2.16b, v6.16b \n" "REV32 v3.16b, v3.16b \n"
"REV32 v1.16b, v1.16b \n"
"MOV v3.16b, v7.16b \n"
"REV32 v2.16b, v2.16b \n"
"REV32 v3.16b, v3.16b \n"
"MOV v4.16b, v0.16b \n" "#load K values in \n"
"MOV v5.16b, v1.16b \n" "LD1 {v16.4s-v19.4s}, [%[k]], #64 \n"
"SHA256SU0 v4.4s, v1.4s \n" "LD1 {v20.4s-v23.4s}, [%[k]], #64 \n"
"SHA256SU1 v4.4s, v2.4s, v3.4s \n" "MOV v14.16b, v12.16b \n" /* store digest for add at the end */
"MOV v6.16b, v2.16b \n" "MOV v15.16b, v13.16b \n"
"SHA256SU0 v5.4s, v2.4s \n" "LD1 {v24.4s-v27.4s}, [%[k]], #64 \n"
"SHA256SU1 v5.4s, v3.4s, v4.4s \n" "LD1 {v28.4s-v31.4s}, [%[k]], #64 \n"
"MOV v7.16b, v3.16b \n"
"SHA256SU0 v6.4s, v3.4s \n"
"SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"MOV v8.16b, v4.16b \n"
"SHA256SU0 v7.4s, v4.4s \n"
"SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"MOV v9.16b, v5.16b \n"
"SHA256SU0 v8.4s, v5.4s \n"
"SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"MOV v10.16b, v6.16b \n"
"SHA256SU0 v9.4s, v6.4s \n"
"SHA256SU1 v9.4s, v7.4s, v8.4s \n"
"MOV v11.16b, v7.16b \n"
"SHA256SU0 v10.4s, v7.4s \n"
"SHA256SU1 v10.4s, v8.4s, v9.4s \n"
"MOV v12.16b, v8.16b \n"
"SHA256SU0 v11.4s, v8.4s \n"
"SHA256SU1 v11.4s, v9.4s, v10.4s \n"
"MOV v13.16b, v9.16b \n"
"SHA256SU0 v12.4s, v9.4s \n"
"SHA256SU1 v12.4s, v10.4s, v11.4s \n"
"MOV v14.16b, v10.16b \n"
"SHA256SU0 v13.4s, v10.4s \n"
"SHA256SU1 v13.4s, v11.4s, v12.4s \n"
"MOV v15.16b, v11.16b \n"
"SHA256SU0 v14.4s, v11.4s \n"
"SHA256SU1 v14.4s, v12.4s, v13.4s \n"
"SHA256SU0 v15.4s, v12.4s \n"
"SHA256SU1 v15.4s, v13.4s, v14.4s \n"
"#Add K values to message \n" /* begining of SHA256 block operation */
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n" "sha256Start:\n"
"ADD v0.4s, v0.4s, v16.4s \n" "MOV v4.16b, v0.16b \n"
"ADD v1.4s, v1.4s, v17.4s \n" "ADD v0.4s, v0.4s, v16.4s \n"
"ADD v2.4s, v2.4s, v18.4s \n" "MOV v11.16b, v12.16b \n"
"ADD v3.4s, v3.4s, v19.4s \n" "SHA256H q12, q13, v0.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n" "SHA256H2 q13, q11, v0.4s \n"
"ADD v4.4s, v4.4s, v16.4s \n"
"ADD v5.4s, v5.4s, v17.4s \n"
"ADD v6.4s, v6.4s, v18.4s \n"
"ADD v7.4s, v7.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v8.4s, v8.4s, v16.4s \n"
"ADD v9.4s, v9.4s, v17.4s \n"
"ADD v10.4s, v10.4s, v18.4s \n"
"ADD v11.4s, v11.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]] \n"
"ADD v12.4s, v12.4s, v16.4s \n"
"ADD v13.4s, v13.4s, v17.4s \n"
"LD1 {v20.4s-v21.4s}, [%[digest]] \n"
"ADD v14.4s, v14.4s, v18.4s \n"
"ADD v15.4s, v15.4s, v19.4s \n"
"#SHA256 operation on updated message \n" "ADD v0.4s, v1.4s, v17.4s \n"
"MOV v16.16b, v20.16b \n" "MOV v11.16b, v12.16b \n"
"MOV v17.16b, v21.16b \n" "SHA256H q12, q13, v0.4s \n"
"MOV v18.16b, v16.16b \n" "SHA256H2 q13, q11, v0.4s \n"
"SHA256H q16, q17, v0.4s \n"
"SHA256H2 q17, q18, v0.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v1.4s \n"
"SHA256H2 q17, q18, v1.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v2.4s \n"
"SHA256H2 q17, q18, v2.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v3.4s \n"
"SHA256H2 q17, q18, v3.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v4.4s \n"
"SHA256H2 q17, q18, v4.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v5.4s \n"
"SHA256H2 q17, q18, v5.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v6.4s \n"
"SHA256H2 q17, q18, v6.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v7.4s \n"
"SHA256H2 q17, q18, v7.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v8.4s \n"
"SHA256H2 q17, q18, v8.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v9.4s \n"
"SHA256H2 q17, q18, v9.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v10.4s \n"
"SHA256H2 q17, q18, v10.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v11.4s \n"
"SHA256H2 q17, q18, v11.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v12.4s \n"
"SHA256H2 q17, q18, v12.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v13.4s \n"
"SHA256H2 q17, q18, v13.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v14.4s \n"
"SHA256H2 q17, q18, v14.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v15.4s \n"
"SHA256H2 q17, q18, v15.4s \n"
"#Add working vars back into digest state \n" "ADD v0.4s, v2.4s, v18.4s \n"
"ADD v16.4s, v16.4s, v20.4s \n" "MOV v11.16b, v12.16b \n"
"ADD v17.4s, v17.4s, v21.4s \n" "SHA256H q12, q13, v0.4s \n"
"STP q16, q17, [%[out]] \n" "SHA256H2 q13, q11, v0.4s \n"
: "=r" (Kpt), [out] "=r" (digestPt), "=r" (bufferPt)
: [k] "0" (Kpt), [digest] "1" (digestPt), [buffer] "2" (bufferPt) "SHA256SU0 v4.4s, v1.4s \n"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "ADD v0.4s, v3.4s, v19.4s \n"
"v15", "v16", "v17", "v18", "v19", "v20", "v21" "MOV v11.16b, v12.16b \n"
"MOV v5.16b, v1.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v5.4s, v2.4s \n"
"SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"ADD v0.4s, v4.4s, v20.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v6.16b, v2.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v6.4s, v3.4s \n"
"SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"ADD v0.4s, v5.4s, v21.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v7.16b, v3.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v7.4s, v4.4s \n"
"SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"ADD v0.4s, v6.4s, v22.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v8.16b, v4.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v8.4s, v5.4s \n"
"SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"ADD v0.4s, v7.4s, v23.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v9.16b, v5.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v9.4s, v6.4s \n"
"SHA256SU1 v9.4s, v7.4s, v8.4s \n"
"ADD v0.4s, v8.4s, v24.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v10.16b, v6.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v10.4s, v7.4s \n"
"SHA256SU1 v10.4s, v8.4s, v9.4s \n"
"ADD v0.4s, v9.4s, v25.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"ADD v0.4s, v10.4s, v26.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
/* Re-use of registers is needed in order to not overwrite
* previous digest value. */
"#move to lower register and handle last rounds 11-15 \n"
"MOV v4.16b, v7.16b \n"
"MOV v1.16b, v8.16b \n"
"MOV v2.16b, v9.16b \n"
"MOV v3.16b, v10.16b \n"
"MOV v5.16b, v8.16b \n"
"SHA256SU0 v4.4s, v1.4s \n" /* 4 -> 11 */
"SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"SHA256SU0 v5.4s, v2.4s \n"
"SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"ADD v0.4s, v4.4s, v27.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v6.16b, v2.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v6.4s, v3.4s \n"
"SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"ADD v0.4s, v5.4s, v28.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v7.16b, v3.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v7.4s, v4.4s \n"
"SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"ADD v0.4s, v6.4s, v29.4s \n"
"MOV v11.16b, v12.16b \n"
"MOV v8.16b, v4.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"SHA256SU0 v8.4s, v5.4s \n"
"SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"ADD v0.4s, v7.4s, v30.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"ADD v0.4s, v8.4s, v31.4s \n"
"MOV v11.16b, v12.16b \n"
"SHA256H q12, q13, v0.4s \n"
"SHA256H2 q13, q11, v0.4s \n"
"#Add working vars back into digest state \n"
"ADD v12.4s, v12.4s, v14.4s \n"
"ADD v13.4s, v13.4s, v15.4s \n"
"#check if more blocks should be done\n"
"SUB w8, w8, #1 \n"
"CBZ w8, sha256End \n"
"#load in message and schedual updates \n"
"LD1 {v0.2d-v3.2d}, [%[dataIn]], #64 \n"
"MOV v14.16b, v12.16b \n"
"MOV v15.16b, v13.16b \n"
"REV32 v0.16b, v0.16b \n"
"REV32 v1.16b, v1.16b \n"
"REV32 v2.16b, v2.16b \n"
"REV32 v3.16b, v3.16b \n"
"B sha256Start \n" /* do another block */
"sha256End:\n"
"STP q12, q13, %[out] \n"
: [out] "=m" (sha256->digest), "=m" (sha256->buffer), "=r" (numBlocks),
"=r" (data)
: [k] "r" (K), [digest] "m" (sha256->digest), [buffer] "m" (sha256->buffer),
[blocks] "2" (numBlocks), [dataIn] "3" (data)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15"
); );
AddLength(sha256, SHA256_BLOCK_SIZE); AddLength(sha256, SHA256_BLOCK_SIZE * numBlocks);
sha256->buffLen = 0;
} /* copy over any remaining data leftover */
XMEMCPY(sha256->buffer, data, add);
sha256->buffLen = add;
} }
return 0; return 0;
@@ -271,12 +316,13 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
int wc_Sha256Final(Sha256* sha256, byte* hash) int wc_Sha256Final(Sha256* sha256, byte* hash)
{ {
byte* local = (byte*)sha256->buffer; byte* local;
word32* Kpt = (word32*)K;
word32* bufferPt = sha256->buffer; if (sha256 == NULL || hash == NULL) {
word32* digestPt = sha256->digest; return BAD_FUNC_ARG;
word32* hashPt = (word32*)hash; }
local = (byte*)sha256->buffer;
AddLength(sha256, sha256->buffLen); /* before adding pads */ AddLength(sha256, sha256->buffLen); /* before adding pads */
local[sha256->buffLen++] = 0x80; /* add 1 */ local[sha256->buffLen++] = 0x80; /* add 1 */
@@ -285,143 +331,153 @@ int wc_Sha256Final(Sha256* sha256, byte* hash)
if (sha256->buffLen > SHA256_PAD_SIZE) { if (sha256->buffLen > SHA256_PAD_SIZE) {
XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen); XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen);
sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen; sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;
bufferPt = sha256->buffer;
digestPt = sha256->digest;
Kpt = (word32*)K;
__asm__ volatile ( __asm__ volatile (
"#load in message and schedual updates \n" "LD1 {v4.2d-v7.2d}, %[buffer] \n"
"LD1 {v4.16b-v7.16b}, [%[buffer]] \n"
"MOV v0.16b, v4.16b \n" "MOV v0.16b, v4.16b \n"
"MOV v1.16b, v5.16b \n" "MOV v1.16b, v5.16b \n"
"REV32 v0.16b, v0.16b \n" "REV32 v0.16b, v0.16b \n"
"MOV v2.16b, v6.16b \n"
"REV32 v1.16b, v1.16b \n" "REV32 v1.16b, v1.16b \n"
"MOV v2.16b, v6.16b \n"
"MOV v3.16b, v7.16b \n" "MOV v3.16b, v7.16b \n"
"REV32 v2.16b, v2.16b \n" "REV32 v2.16b, v2.16b \n"
"REV32 v3.16b, v3.16b \n" "REV32 v3.16b, v3.16b \n"
"MOV v4.16b, v0.16b \n" "MOV v4.16b, v0.16b \n"
"MOV v5.16b, v1.16b \n" "MOV v5.16b, v1.16b \n"
"SHA256SU0 v4.4s, v1.4s \n" "LD1 {v20.2d-v21.2d}, %[digest] \n"
"SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"MOV v6.16b, v2.16b \n"
"SHA256SU0 v5.4s, v2.4s \n"
"SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"MOV v7.16b, v3.16b \n"
"SHA256SU0 v6.4s, v3.4s \n"
"SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"MOV v8.16b, v4.16b \n"
"SHA256SU0 v7.4s, v4.4s \n"
"SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"MOV v9.16b, v5.16b \n"
"SHA256SU0 v8.4s, v5.4s \n"
"SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"MOV v10.16b, v6.16b \n"
"SHA256SU0 v9.4s, v6.4s \n"
"SHA256SU1 v9.4s, v7.4s, v8.4s \n"
"MOV v11.16b, v7.16b \n"
"SHA256SU0 v10.4s, v7.4s \n"
"SHA256SU1 v10.4s, v8.4s, v9.4s \n"
"MOV v12.16b, v8.16b \n"
"SHA256SU0 v11.4s, v8.4s \n"
"SHA256SU1 v11.4s, v9.4s, v10.4s \n"
"MOV v13.16b, v9.16b \n"
"SHA256SU0 v12.4s, v9.4s \n"
"SHA256SU1 v12.4s, v10.4s, v11.4s \n"
"MOV v14.16b, v10.16b \n"
"SHA256SU0 v13.4s, v10.4s \n"
"SHA256SU1 v13.4s, v11.4s, v12.4s \n"
"MOV v15.16b, v11.16b \n"
"SHA256SU0 v14.4s, v11.4s \n"
"SHA256SU1 v14.4s, v12.4s, v13.4s \n"
"SHA256SU0 v15.4s, v12.4s \n"
"SHA256SU1 v15.4s, v13.4s, v14.4s \n"
"#Add K values to message \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v0.4s, v0.4s, v16.4s \n"
"ADD v1.4s, v1.4s, v17.4s \n"
"ADD v2.4s, v2.4s, v18.4s \n"
"ADD v3.4s, v3.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v4.4s, v4.4s, v16.4s \n"
"ADD v5.4s, v5.4s, v17.4s \n"
"ADD v6.4s, v6.4s, v18.4s \n"
"ADD v7.4s, v7.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v8.4s, v8.4s, v16.4s \n"
"ADD v9.4s, v9.4s, v17.4s \n"
"ADD v10.4s, v10.4s, v18.4s \n"
"ADD v11.4s, v11.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]] \n"
"ADD v12.4s, v12.4s, v16.4s \n"
"ADD v13.4s, v13.4s, v17.4s \n"
"LD1 {v20.4s-v21.4s}, [%[digest]] \n"
"ADD v14.4s, v14.4s, v18.4s \n"
"ADD v15.4s, v15.4s, v19.4s \n"
"#SHA256 operation on updated message \n" "#SHA256 operation on updated message \n"
"MOV v16.16b, v20.16b \n" "MOV v16.16b, v20.16b \n"
"MOV v17.16b, v21.16b \n" "MOV v17.16b, v21.16b \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v0.4s \n" "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
"SHA256H2 q17, q18, v0.4s \n" "SHA256SU0 v4.4s, v1.4s \n"
"MOV v18.16b, v16.16b \n" "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"SHA256H q16, q17, v1.4s \n" "ADD v0.4s, v0.4s, v22.4s \n"
"SHA256H2 q17, q18, v1.4s \n" "MOV v6.16b, v2.16b \n"
"MOV v18.16b, v16.16b \n" "MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v2.4s \n" "SHA256H q16, q17, v0.4s \n"
"SHA256H2 q17, q18, v2.4s \n" "SHA256H2 q17, q18, v0.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v3.4s \n" "SHA256SU0 v5.4s, v2.4s \n"
"SHA256H2 q17, q18, v3.4s \n" "SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"MOV v18.16b, v16.16b \n" "ADD v1.4s, v1.4s, v23.4s \n"
"SHA256H q16, q17, v4.4s \n" "MOV v18.16b, v16.16b \n"
"SHA256H2 q17, q18, v4.4s \n" "MOV v7.16b, v3.16b \n"
"MOV v18.16b, v16.16b \n" "SHA256H q16, q17, v1.4s \n"
"SHA256H q16, q17, v5.4s \n" "SHA256H2 q17, q18, v1.4s \n"
"SHA256H2 q17, q18, v5.4s \n"
"MOV v18.16b, v16.16b \n" "SHA256SU0 v6.4s, v3.4s \n"
"SHA256H q16, q17, v6.4s \n" "SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"SHA256H2 q17, q18, v6.4s \n" "ADD v2.4s, v2.4s, v24.4s \n"
"MOV v18.16b, v16.16b \n" "MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v7.4s \n" "MOV v8.16b, v4.16b \n"
"SHA256H2 q17, q18, v7.4s \n" "SHA256H q16, q17, v2.4s \n"
"MOV v18.16b, v16.16b \n" "SHA256H2 q17, q18, v2.4s \n"
"SHA256H q16, q17, v8.4s \n"
"SHA256H2 q17, q18, v8.4s \n" "SHA256SU0 v7.4s, v4.4s \n"
"MOV v18.16b, v16.16b \n" "SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"SHA256H q16, q17, v9.4s \n" "ADD v3.4s, v3.4s, v25.4s \n"
"SHA256H2 q17, q18, v9.4s \n" "MOV v18.16b, v16.16b \n"
"MOV v18.16b, v16.16b \n" "MOV v9.16b, v5.16b \n"
"SHA256H q16, q17, v10.4s \n" "SHA256H q16, q17, v3.4s \n"
"SHA256H2 q17, q18, v10.4s \n" "SHA256H2 q17, q18, v3.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v11.4s \n" "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
"SHA256H2 q17, q18, v11.4s \n" "SHA256SU0 v8.4s, v5.4s \n"
"MOV v18.16b, v16.16b \n" "SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"SHA256H q16, q17, v12.4s \n" "ADD v4.4s, v4.4s, v22.4s \n"
"SHA256H2 q17, q18, v12.4s \n" "MOV v18.16b, v16.16b \n"
"MOV v18.16b, v16.16b \n" "MOV v10.16b, v6.16b \n"
"SHA256H q16, q17, v13.4s \n" "SHA256H q16, q17, v4.4s \n"
"SHA256H2 q17, q18, v13.4s \n" "SHA256H2 q17, q18, v4.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v14.4s \n" "SHA256SU0 v9.4s, v6.4s \n"
"SHA256H2 q17, q18, v14.4s \n" "SHA256SU1 v9.4s, v7.4s, v8.4s \n"
"MOV v18.16b, v16.16b \n" "ADD v5.4s, v5.4s, v23.4s \n"
"SHA256H q16, q17, v15.4s \n" "MOV v18.16b, v16.16b \n"
"SHA256H2 q17, q18, v15.4s \n" "MOV v11.16b, v7.16b \n"
"SHA256H q16, q17, v5.4s \n"
"SHA256H2 q17, q18, v5.4s \n"
"SHA256SU0 v10.4s, v7.4s \n"
"SHA256SU1 v10.4s, v8.4s, v9.4s \n"
"ADD v6.4s, v6.4s, v24.4s \n"
"MOV v18.16b, v16.16b \n"
"MOV v12.16b, v8.16b \n"
"SHA256H q16, q17, v6.4s \n"
"SHA256H2 q17, q18, v6.4s \n"
"SHA256SU0 v11.4s, v8.4s \n"
"SHA256SU1 v11.4s, v9.4s, v10.4s \n"
"ADD v7.4s, v7.4s, v25.4s \n"
"MOV v18.16b, v16.16b \n"
"MOV v13.16b, v9.16b \n"
"SHA256H q16, q17, v7.4s \n"
"SHA256H2 q17, q18, v7.4s \n"
"LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
"SHA256SU0 v12.4s, v9.4s \n"
"SHA256SU1 v12.4s, v10.4s, v11.4s \n"
"ADD v8.4s, v8.4s, v22.4s \n"
"MOV v18.16b, v16.16b \n"
"MOV v14.16b, v10.16b \n"
"SHA256H q16, q17, v8.4s \n"
"SHA256H2 q17, q18, v8.4s \n"
"SHA256SU0 v13.4s, v10.4s \n"
"SHA256SU1 v13.4s, v11.4s, v12.4s \n"
"ADD v9.4s, v9.4s, v23.4s \n"
"MOV v18.16b, v16.16b \n"
"MOV v15.16b, v11.16b \n"
"SHA256H q16, q17, v9.4s \n"
"SHA256H2 q17, q18, v9.4s \n"
"SHA256SU0 v14.4s, v11.4s \n"
"SHA256SU1 v14.4s, v12.4s, v13.4s \n"
"ADD v10.4s, v10.4s, v24.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v10.4s \n"
"SHA256H2 q17, q18, v10.4s \n"
"SHA256SU0 v15.4s, v12.4s \n"
"SHA256SU1 v15.4s, v13.4s, v14.4s \n"
"ADD v11.4s, v11.4s, v25.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v11.4s \n"
"SHA256H2 q17, q18, v11.4s \n"
"LD1 {v22.16b-v25.16b}, [%[k]] \n"
"ADD v12.4s, v12.4s, v22.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v12.4s \n"
"SHA256H2 q17, q18, v12.4s \n"
"ADD v13.4s, v13.4s, v23.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v13.4s \n"
"SHA256H2 q17, q18, v13.4s \n"
"ADD v14.4s, v14.4s, v24.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v14.4s \n"
"SHA256H2 q17, q18, v14.4s \n"
"ADD v15.4s, v15.4s, v25.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v15.4s \n"
"SHA256H2 q17, q18, v15.4s \n"
"#Add working vars back into digest state \n" "#Add working vars back into digest state \n"
"ADD v16.4s, v16.4s, v20.4s \n" "ADD v16.4s, v16.4s, v20.4s \n"
"ADD v17.4s, v17.4s, v21.4s \n" "ADD v17.4s, v17.4s, v21.4s \n"
"STP q16, q17, [%[out]] \n" "STP q16, q17, %[out] \n"
: "=r" (Kpt), [out] "=r" (digestPt), "=r" (bufferPt) : [out] "=m" (sha256->digest)
: [k] "0" (Kpt), [digest] "1" (digestPt), [buffer] "2" (bufferPt) : [k] "r" (K), [digest] "m" (sha256->digest),
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", [buffer] "m" (sha256->buffer)
"v8", "v9", "v10", "v11", "v12", "v13", "v14", : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11"
"v15", "v16", "v17", "v18", "v19", "v20", "v21" , "v12", "v13", "v14", "v15", "v16", "v17", "v18"
); );
sha256->buffLen = 0; sha256->buffLen = 0;
@@ -435,23 +491,16 @@ int wc_Sha256Final(Sha256* sha256, byte* hash)
/* store lengths */ /* store lengths */
#if defined(LITTLE_ENDIAN_ORDER) #if defined(LITTLE_ENDIAN_ORDER)
bufferPt = sha256->buffer;
__asm__ volatile ( __asm__ volatile (
"LD1 {v0.16b}, [%[in]] \n" "LD1 {v0.2d-v3.2d}, %[in] \n"
"REV32 v0.16b, v0.16b \n" "REV32 v0.16b, v0.16b \n"
"ST1 {v0.16b}, [%[out]], #16 \n" "REV32 v1.16b, v1.16b \n"
"LD1 {v0.16b}, [%[in]] \n" "REV32 v2.16b, v2.16b \n"
"REV32 v0.16b, v0.16b \n" "REV32 v3.16b, v3.16b \n"
"ST1 {v0.16b}, [%[out]], #16 \n" "ST1 {v0.2d-v3.2d}, %[out] \n"
"LD1 {v0.16b}, [%[in]] \n" : [out] "=m" (sha256->buffer)
"REV32 v0.16b, v0.16b \n" : [in] "m" (sha256->buffer)
"ST1 {v0.16b}, [%[out]], #16 \n" : "cc", "memory"
"LD1 {v0.16b}, [%[in]] \n"
"REV32 v0.16b, v0.16b \n"
"ST1 {v0.16b}, [%[out]] \n"
: [out] "=r" (bufferPt)
: [in] "0" (bufferPt)
: "cc"
); );
#endif #endif
/* ! length ordering dependent on digest endian type ! */ /* ! length ordering dependent on digest endian type ! */
@@ -459,130 +508,138 @@ int wc_Sha256Final(Sha256* sha256, byte* hash)
XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen, XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
sizeof(word32)); sizeof(word32));
bufferPt = sha256->buffer;
digestPt = sha256->digest;
Kpt = (word32*)K;
__asm__ volatile ( __asm__ volatile (
"#load in message and schedual updates \n" "#load in message and schedual updates \n"
"LD1 {v4.16b-v7.16b}, [%[buffer]] \n" "LD1 {v4.2d-v7.2d}, %[buffer] \n"
"MOV v0.16b, v4.16b \n" "MOV v0.16b, v4.16b \n"
"MOV v1.16b, v5.16b \n" "MOV v1.16b, v5.16b \n"
"MOV v2.16b, v6.16b \n" "MOV v2.16b, v6.16b \n"
"MOV v3.16b, v7.16b \n" "MOV v3.16b, v7.16b \n"
"SHA256SU0 v4.4s, v1.4s \n" "LD1 {v20.2d-v21.2d}, %[digest] \n"
"SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"MOV v6.16b, v2.16b \n"
"SHA256SU0 v5.4s, v2.4s \n"
"SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"MOV v7.16b, v3.16b \n"
"SHA256SU0 v6.4s, v3.4s \n"
"SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"MOV v8.16b, v4.16b \n"
"SHA256SU0 v7.4s, v4.4s \n"
"SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"MOV v9.16b, v5.16b \n"
"SHA256SU0 v8.4s, v5.4s \n"
"SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"MOV v10.16b, v6.16b \n"
"SHA256SU0 v9.4s, v6.4s \n"
"SHA256SU1 v9.4s, v7.4s, v8.4s \n"
"MOV v11.16b, v7.16b \n"
"SHA256SU0 v10.4s, v7.4s \n"
"SHA256SU1 v10.4s, v8.4s, v9.4s \n"
"MOV v12.16b, v8.16b \n"
"SHA256SU0 v11.4s, v8.4s \n"
"SHA256SU1 v11.4s, v9.4s, v10.4s \n"
"MOV v13.16b, v9.16b \n"
"SHA256SU0 v12.4s, v9.4s \n"
"SHA256SU1 v12.4s, v10.4s, v11.4s \n"
"MOV v14.16b, v10.16b \n"
"SHA256SU0 v13.4s, v10.4s \n"
"SHA256SU1 v13.4s, v11.4s, v12.4s \n"
"MOV v15.16b, v11.16b \n"
"SHA256SU0 v14.4s, v11.4s \n"
"SHA256SU1 v14.4s, v12.4s, v13.4s \n"
"SHA256SU0 v15.4s, v12.4s \n"
"SHA256SU1 v15.4s, v13.4s, v14.4s \n"
"#Add K values to message \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v0.4s, v0.4s, v16.4s \n"
"ADD v1.4s, v1.4s, v17.4s \n"
"ADD v2.4s, v2.4s, v18.4s \n"
"ADD v3.4s, v3.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v4.4s, v4.4s, v16.4s \n"
"ADD v5.4s, v5.4s, v17.4s \n"
"ADD v6.4s, v6.4s, v18.4s \n"
"ADD v7.4s, v7.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]], #64 \n"
"ADD v8.4s, v8.4s, v16.4s \n"
"ADD v9.4s, v9.4s, v17.4s \n"
"ADD v10.4s, v10.4s, v18.4s \n"
"ADD v11.4s, v11.4s, v19.4s \n"
"LD1 {v16.16b-v19.16b}, [%[k]] \n"
"ADD v12.4s, v12.4s, v16.4s \n"
"ADD v13.4s, v13.4s, v17.4s \n"
"LD1 {v20.4s-v21.4s}, [%[digest]] \n"
"ADD v14.4s, v14.4s, v18.4s \n"
"ADD v15.4s, v15.4s, v19.4s \n"
"#SHA256 operation on updated message \n"
"MOV v16.16b, v20.16b \n" "MOV v16.16b, v20.16b \n"
"MOV v17.16b, v21.16b \n" "MOV v17.16b, v21.16b \n"
"MOV v18.16b, v16.16b \n" "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
"SHA256H q16, q17, v0.4s \n" "SHA256SU0 v4.4s, v1.4s \n"
"SHA256H2 q17, q18, v0.4s \n" "SHA256SU1 v4.4s, v2.4s, v3.4s \n"
"MOV v18.16b, v16.16b \n" "ADD v0.4s, v0.4s, v22.4s \n"
"SHA256H q16, q17, v1.4s \n" "MOV v6.16b, v2.16b \n"
"SHA256H2 q17, q18, v1.4s \n" "MOV v18.16b, v16.16b \n"
"MOV v18.16b, v16.16b \n" "SHA256H q16, q17, v0.4s \n"
"SHA256H q16, q17, v2.4s \n" "SHA256H2 q17, q18, v0.4s \n"
"SHA256H2 q17, q18, v2.4s \n"
"MOV v18.16b, v16.16b \n" "SHA256SU0 v5.4s, v2.4s \n"
"SHA256H q16, q17, v3.4s \n" "SHA256SU1 v5.4s, v3.4s, v4.4s \n"
"SHA256H2 q17, q18, v3.4s \n" "ADD v1.4s, v1.4s, v23.4s \n"
"MOV v18.16b, v16.16b \n" "MOV v7.16b, v3.16b \n"
"SHA256H q16, q17, v4.4s \n" "MOV v18.16b, v16.16b \n"
"SHA256H2 q17, q18, v4.4s \n" "SHA256H q16, q17, v1.4s \n"
"MOV v18.16b, v16.16b \n" "SHA256H2 q17, q18, v1.4s \n"
"SHA256H q16, q17, v5.4s \n"
"SHA256H2 q17, q18, v5.4s \n" "SHA256SU0 v6.4s, v3.4s \n"
"MOV v18.16b, v16.16b \n" "SHA256SU1 v6.4s, v4.4s, v5.4s \n"
"SHA256H q16, q17, v6.4s \n" "ADD v2.4s, v2.4s, v24.4s \n"
"SHA256H2 q17, q18, v6.4s \n" "MOV v18.16b, v16.16b \n"
"MOV v18.16b, v16.16b \n" "MOV v8.16b, v4.16b \n"
"SHA256H q16, q17, v7.4s \n" "SHA256H q16, q17, v2.4s \n"
"SHA256H2 q17, q18, v7.4s \n" "SHA256H2 q17, q18, v2.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v8.4s \n" "SHA256SU0 v7.4s, v4.4s \n"
"SHA256H2 q17, q18, v8.4s \n" "SHA256SU1 v7.4s, v5.4s, v6.4s \n"
"MOV v18.16b, v16.16b \n" "ADD v3.4s, v3.4s, v25.4s \n"
"SHA256H q16, q17, v9.4s \n" "MOV v18.16b, v16.16b \n"
"SHA256H2 q17, q18, v9.4s \n" "MOV v9.16b, v5.16b \n"
"MOV v18.16b, v16.16b \n" "SHA256H q16, q17, v3.4s \n"
"SHA256H q16, q17, v10.4s \n" "SHA256H2 q17, q18, v3.4s \n"
"SHA256H2 q17, q18, v10.4s \n"
"MOV v18.16b, v16.16b \n" "LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
"SHA256H q16, q17, v11.4s \n" "SHA256SU0 v8.4s, v5.4s \n"
"SHA256H2 q17, q18, v11.4s \n" "SHA256SU1 v8.4s, v6.4s, v7.4s \n"
"MOV v18.16b, v16.16b \n" "ADD v4.4s, v4.4s, v22.4s \n"
"SHA256H q16, q17, v12.4s \n" "MOV v18.16b, v16.16b \n"
"SHA256H2 q17, q18, v12.4s \n" "MOV v10.16b, v6.16b \n"
"MOV v18.16b, v16.16b \n" "SHA256H q16, q17, v4.4s \n"
"SHA256H q16, q17, v13.4s \n" "SHA256H2 q17, q18, v4.4s \n"
"SHA256H2 q17, q18, v13.4s \n"
"MOV v18.16b, v16.16b \n" "SHA256SU0 v9.4s, v6.4s \n"
"SHA256H q16, q17, v14.4s \n" "SHA256SU1 v9.4s, v7.4s, v8.4s \n"
"SHA256H2 q17, q18, v14.4s \n" "ADD v5.4s, v5.4s, v23.4s \n"
"MOV v18.16b, v16.16b \n" "MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v15.4s \n" "MOV v11.16b, v7.16b \n"
"SHA256H2 q17, q18, v15.4s \n" "SHA256H q16, q17, v5.4s \n"
"SHA256H2 q17, q18, v5.4s \n"
"SHA256SU0 v10.4s, v7.4s \n"
"SHA256SU1 v10.4s, v8.4s, v9.4s \n"
"ADD v6.4s, v6.4s, v24.4s \n"
"MOV v18.16b, v16.16b \n"
"MOV v12.16b, v8.16b \n"
"SHA256H q16, q17, v6.4s \n"
"SHA256H2 q17, q18, v6.4s \n"
"SHA256SU0 v11.4s, v8.4s \n"
"SHA256SU1 v11.4s, v9.4s, v10.4s \n"
"ADD v7.4s, v7.4s, v25.4s \n"
"MOV v18.16b, v16.16b \n"
"MOV v13.16b, v9.16b \n"
"SHA256H q16, q17, v7.4s \n"
"SHA256H2 q17, q18, v7.4s \n"
"LD1 {v22.16b-v25.16b}, [%[k]], #64 \n"
"SHA256SU0 v12.4s, v9.4s \n"
"SHA256SU1 v12.4s, v10.4s, v11.4s \n"
"ADD v8.4s, v8.4s, v22.4s \n"
"MOV v18.16b, v16.16b \n"
"MOV v14.16b, v10.16b \n"
"SHA256H q16, q17, v8.4s \n"
"SHA256H2 q17, q18, v8.4s \n"
"SHA256SU0 v13.4s, v10.4s \n"
"SHA256SU1 v13.4s, v11.4s, v12.4s \n"
"ADD v9.4s, v9.4s, v23.4s \n"
"MOV v18.16b, v16.16b \n"
"MOV v15.16b, v11.16b \n"
"SHA256H q16, q17, v9.4s \n"
"SHA256H2 q17, q18, v9.4s \n"
"SHA256SU0 v14.4s, v11.4s \n"
"SHA256SU1 v14.4s, v12.4s, v13.4s \n"
"ADD v10.4s, v10.4s, v24.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v10.4s \n"
"SHA256H2 q17, q18, v10.4s \n"
"SHA256SU0 v15.4s, v12.4s \n"
"SHA256SU1 v15.4s, v13.4s, v14.4s \n"
"ADD v11.4s, v11.4s, v25.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v11.4s \n"
"SHA256H2 q17, q18, v11.4s \n"
"LD1 {v22.16b-v25.16b}, [%[k]] \n"
"ADD v12.4s, v12.4s, v22.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v12.4s \n"
"SHA256H2 q17, q18, v12.4s \n"
"ADD v13.4s, v13.4s, v23.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v13.4s \n"
"SHA256H2 q17, q18, v13.4s \n"
"ADD v14.4s, v14.4s, v24.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v14.4s \n"
"SHA256H2 q17, q18, v14.4s \n"
"ADD v15.4s, v15.4s, v25.4s \n"
"MOV v18.16b, v16.16b \n"
"SHA256H q16, q17, v15.4s \n"
"SHA256H2 q17, q18, v15.4s \n"
"#Add working vars back into digest state \n" "#Add working vars back into digest state \n"
"ADD v16.4s, v16.4s, v20.4s \n" "ADD v16.4s, v16.4s, v20.4s \n"
"ADD v17.4s, v17.4s, v21.4s \n" "ADD v17.4s, v17.4s, v21.4s \n"
"STP q16, q17, [%[out]] \n"
"#Store value as hash output \n" "#Store value as hash output \n"
#if defined(LITTLE_ENDIAN_ORDER) #if defined(LITTLE_ENDIAN_ORDER)
@@ -593,12 +650,13 @@ int wc_Sha256Final(Sha256* sha256, byte* hash)
"REV32 v17.16b, v17.16b \n" "REV32 v17.16b, v17.16b \n"
#endif #endif
"ST1 {v17.16b}, [%[hashOut]] \n" "ST1 {v17.16b}, [%[hashOut]] \n"
: "=r" (Kpt), [out] "=r" (digestPt), "=r" (bufferPt), : [hashOut] "=r" (hash)
[hashOut] "=r" (hashPt) : [k] "r" (K), [digest] "m" (sha256->digest),
: [k] "0" (Kpt), [digest] "1" (digestPt), [buffer] "2" (bufferPt), "3" (hashPt) [buffer] "m" (sha256->buffer),
"0" (hash)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
"v15", "v16", "v17", "v18", "v19", "v20", "v21" "v15", "v16", "v17", "v18"
); );
return wc_InitSha256(sha256); /* reset state */ return wc_InitSha256(sha256); /* reset state */