diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index 9cc2a3a09..62538e7e4 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -485,8 +485,8 @@ int wc_InitAes_h(Aes* aes, void* h) "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" "LD1 {v9.2d-v11.2d},%[Key], #48 \n" "LD1 {v0.2d}, %[reg] \n" - "LD1 {v12.2d}, [%[input]], #16 \n" + "LD1 {v12.2d}, [%[input]], #16 \n" "AESCBC128Block:\n" "#CBC operations, xorbuf in with current aes->reg \n" "EOR v0.16b, v0.16b, v12.16b \n" @@ -509,8 +509,8 @@ int wc_InitAes_h(Aes* aes, void* h) "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v10.16b \n" - "EOR v0.16b, v0.16b, v11.16b \n" "SUB w11, w11, #1 \n" + "EOR v0.16b, v0.16b, v11.16b \n" "ST1 {v0.2d}, [%[out]], #16 \n" "CBZ w11, AESCBC128end \n" @@ -525,7 +525,7 @@ int wc_InitAes_h(Aes* aes, void* h) :"0" (out), [Key] "m" (aes->key), [input] "r" (in), [blocks] "r" (numBlocks), [reg] "m" (aes->reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12" + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" ); break; @@ -894,7 +894,7 @@ int wc_InitAes_h(Aes* aes, void* h) __asm__ __volatile__ ( "MOV w11, %w[blocks] \n" "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - + "#Create vector with the value 1 \n" "MOVI v15.16b, #1 \n" "USHR v15.2d, v15.2d, #56 \n" @@ -905,8 +905,86 @@ int wc_InitAes_h(Aes* aes, void* h) "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" "LD1 {v13.2d}, %[reg] \n" - "LD1 {v12.2d}, [%[input]], #16 \n" - "AESCTR128Block: \n" + /* double block */ + "AESCTR128Block2: \n" + "CMP w11, #1 \n" + "BEQ AESCTR128Block \n" + "CMP w11, #0 \n" + "BEQ AESCTRend \n" + + "MOV v0.16b, v13.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v13.16b, v13.16b \n" /* network order */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "SUB w11, w11, #2 \n" + "ADD v15.2d, v13.2d, v14.2d \n" /* add 1 to counter 
*/ + "ADD v13.2d, v15.2d, v14.2d \n" /* add 1 to counter */ + + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v15.16b, v15.16b \n" /* revert from network order */ + "REV64 v13.16b, v13.16b \n" /* revert from network order */ + + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v15.16b, v1.16b \n" + "AESMC v15.16b, v15.16b \n" + + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v15.16b, v2.16b \n" + "AESMC v15.16b, v15.16b \n" + + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v15.16b, v3.16b \n" + "AESMC v15.16b, v15.16b \n" + + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v15.16b, v4.16b \n" + "AESMC v15.16b, v15.16b \n" + + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v15.16b, v5.16b \n" + "AESMC v15.16b, v15.16b \n" + + "AESE v0.16b, v10.16b \n" + "AESE v15.16b, v6.16b \n" + "AESMC v15.16b, v15.16b \n" + + "EOR v0.16b, v0.16b, v11.16b \n" + "AESE v15.16b, v7.16b \n" + "AESMC v15.16b, v15.16b \n" + + "LD1 {v12.2d}, [%[input]], #16 \n" + "AESE v15.16b, v8.16b \n" + "AESMC v15.16b, v15.16b \n" + + "EOR v0.16b, v0.16b, v12.16b \n" + "AESE v15.16b, v9.16b \n" + "AESMC v15.16b, v15.16b \n" + + "LD1 {v12.2d}, [%[input]], #16 \n" + "AESE v15.16b, v10.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v15.16b, v15.16b, v11.16b \n" + "EOR v15.16b, v15.16b, v12.16b \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "B AESCTR128Block2 \n" + + /* single block */ + "AESCTR128Block: \n" "MOV v0.16b, v13.16b \n" "AESE v0.16b, v1.16b \n" "AESMC v0.16b, v0.16b \n" @@ -935,18 +1013,15 @@ int wc_InitAes_h(Aes* aes, void* h) "AESE v0.16b, v10.16b \n" "EOR v0.16b, v0.16b, v11.16b \n" "#CTR operations, increment counter and xorbuf \n" + "LD1 {v12.2d}, [%[input]], #16 \n" "EOR v0.16b, v0.16b, v12.16b \n" "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, AESCTRend \n" - "LD1 
{v12.2d}, [%[input]], #16 \n" - "B AESCTR128Block \n" - + "AESCTRend: \n" "#store current counter value at the end \n" "ST1 {v13.2d}, %[regOut] \n" - - :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg), + + :[out] "=r" (out), "=r" (keyPt), [regOut] "=m" (aes->reg), "=r" (in) :"0" (out), [Key] "1" (keyPt), [input] "3" (in), [blocks] "r" (numBlocks), [reg] "m" (aes->reg) @@ -970,9 +1045,97 @@ int wc_InitAes_h(Aes* aes, void* h) "LD1 {v9.2d-v12.2d}, [%[Key]], #64\n" "LD1 {v15.2d}, %[reg] \n" "LD1 {v13.16b}, [%[Key]], #16 \n" - "LD1 {v14.2d}, [%[input]], #16 \n" + + /* double block */ + "AESCTR192Block2: \n" + "CMP w11, #1 \n" + "BEQ AESCTR192Block \n" + "CMP w11, #0 \n" + "BEQ AESCTR192end \n" + + "MOV v0.16b, v15.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v15.16b, v15.16b \n" /* network order */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "SUB w11, w11, #2 \n" + "ADD v17.2d, v15.2d, v16.2d \n" /* add 1 to counter */ + "ADD v15.2d, v17.2d, v16.2d \n" /* add 1 to counter */ + + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v17.16b, v17.16b, v17.16b, #8 \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v17.16b, v17.16b \n" /* revert from network order */ + "REV64 v15.16b, v15.16b \n" /* revert from network order */ + + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v1.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v2.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v3.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v4.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v5.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v10.16b 
\n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v6.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v17.16b, v7.16b \n" + "AESMC v17.16b, v17.16b \n" + + "AESE v0.16b, v12.16b \n" + "AESE v17.16b, v8.16b \n" + "AESMC v17.16b, v17.16b \n" + + "EOR v0.16b, v0.16b, v13.16b \n" + "AESE v17.16b, v9.16b \n" + "AESMC v17.16b, v17.16b \n" + + "LD1 {v14.2d}, [%[input]], #16 \n" + "AESE v17.16b, v10.16b \n" + "AESMC v17.16b, v17.16b \n" + + "EOR v0.16b, v0.16b, v14.16b \n" + "AESE v17.16b, v11.16b \n" + "AESMC v17.16b, v17.16b \n" + + "LD1 {v14.2d}, [%[input]], #16 \n" + "AESE v17.16b, v12.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v17.16b, v17.16b, v13.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "ST1 {v17.2d}, [%[out]], #16 \n" + + "B AESCTR192Block2 \n" "AESCTR192Block: \n" + "LD1 {v14.2d}, [%[input]], #16 \n" "MOV v0.16b, v15.16b \n" "AESE v0.16b, v1.16b \n" @@ -1009,10 +1172,6 @@ int wc_InitAes_h(Aes* aes, void* h) "EOR v0.16b, v0.16b, v14.16b \n" "ST1 {v0.2d}, [%[out]], #16 \n" - "CBZ w11, AESCTR192end \n" - "LD1 {v14.2d}, [%[input]], #16 \n" - "B AESCTR192Block \n" - "AESCTR192end: \n" "#store current counter value at the end \n" "ST1 {v15.2d}, %[regOut] \n" @@ -1023,7 +1182,7 @@ int wc_InitAes_h(Aes* aes, void* h) [blocks] "r" (numBlocks), [reg] "m" (aes->reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", - "v16" + "v16", "v17" ); break; @@ -1043,8 +1202,106 @@ int wc_InitAes_h(Aes* aes, void* h) "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n" "LD1 {v17.2d}, %[reg] \n" + /* double block */ + "AESCTR256Block2: \n" + "CMP w11, #1 \n" + "BEQ AESCTR256Block \n" + "CMP w11, #0 \n" + "BEQ AESCTR256end \n" + + "MOV v0.16b, v17.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v17.16b, v17.16b \n" /* network order */ + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v17.16b, v17.16b, v17.16b, #8 \n" /* swap 64-bit halves before the add */ + "SUB w11, 
w11, #2 \n" + "ADD v19.2d, v17.2d, v18.2d \n" /* add 1 to counter */ + "ADD v17.2d, v19.2d, v18.2d \n" /* add 1 to counter */ + + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "EXT v17.16b, v17.16b, v17.16b, #8 \n" + + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "REV64 v19.16b, v19.16b \n" /* revert from network order */ + "REV64 v17.16b, v17.16b \n" /* revert from network order */ + + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v1.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v2.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v3.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v4.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v5.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v6.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v7.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v8.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v19.16b, v9.16b \n" + "AESMC v19.16b, v19.16b \n" + + "AESE v0.16b, v14.16b \n" + "AESE v19.16b, v10.16b \n" + "AESMC v19.16b, v19.16b \n" + + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v19.16b, v11.16b \n" + "AESMC v19.16b, v19.16b \n" + "LD1 {v16.2d}, [%[input]], #16 \n" + "AESE v19.16b, v12.16b \n" + "AESMC v19.16b, v19.16b \n" + + "EOR v0.16b, v0.16b, v16.16b \n" + "AESE v19.16b, v13.16b \n" + "AESMC v19.16b, v19.16b \n" + + "LD1 {v16.2d}, [%[input]], #16 \n" + "AESE v19.16b, v14.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "EOR v19.16b, v19.16b, v15.16b 
\n" + "EOR v19.16b, v19.16b, v16.16b \n" + "ST1 {v19.2d}, [%[out]], #16 \n" + + "B AESCTR256Block2 \n" + "AESCTR256Block: \n" + "LD1 {v16.2d}, [%[input]], #16 \n" "MOV v0.16b, v17.16b \n" "AESE v0.16b, v1.16b \n" "AESMC v0.16b, v0.16b \n" @@ -1057,7 +1314,6 @@ int wc_InitAes_h(Aes* aes, void* h) "ADD v17.2d, v17.2d, v18.2d \n" /* add 1 to counter */ "AESE v0.16b, v4.16b \n" "AESMC v0.16b, v0.16b \n" - "SUB w11, w11, #1 \n" "AESE v0.16b, v5.16b \n" "AESMC v0.16b, v0.16b \n" "EXT v17.16b, v17.16b, v17.16b, #8 \n" @@ -1084,10 +1340,6 @@ int wc_InitAes_h(Aes* aes, void* h) "EOR v0.16b, v0.16b, v16.16b \n" "ST1 {v0.2d}, [%[out]], #16 \n" - "CBZ w11, AESCTR256end \n" - "LD1 {v16.2d}, [%[input]], #16 \n" - "B AESCTR256Block \n" - "AESCTR256end: \n" "#store current counter value at the end \n" "ST1 {v17.2d}, %[regOut] \n" @@ -1456,7 +1708,7 @@ static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, ,[inX] "4" (xPt), [inY] "m" (aes->H) : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x12" + ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24" ); } @@ -1546,9 +1798,13 @@ static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, ); - /* authTagSz can be smaller than AES_BLOCK_SIZE */ - XMEMCPY(authTag, scratch, authTagSz); - + if (authTagSz > AES_BLOCK_SIZE) { + XMEMCPY(authTag, scratch, AES_BLOCK_SIZE); + } + else { + /* authTagSz can be smaller than AES_BLOCK_SIZE */ + XMEMCPY(authTag, scratch, authTagSz); + } return 0; } @@ -1777,7 +2033,7 @@ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, ,[inX] "4" (xPt), [inY] "m" (aes->H) : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x12" + ,"v15", "v16", "v17", "v18", "v19", "v20", 
"v21", "v22", "v23", "v24" ); } @@ -1872,8 +2128,13 @@ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, ); - /* authTagSz can be smaller than AES_BLOCK_SIZE */ - XMEMCPY(authTag, scratch, authTagSz); + if (authTagSz > AES_BLOCK_SIZE) { + XMEMCPY(authTag, scratch, AES_BLOCK_SIZE); + } + else { + /* authTagSz can be smaller than AES_BLOCK_SIZE */ + XMEMCPY(authTag, scratch, authTagSz); + } return 0; } @@ -2111,7 +2372,7 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, ,[inX] "4" (xPt), [inY] "m" (aes->H) : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "x12" + ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24" ); } @@ -2210,8 +2471,13 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, ); - /* authTagSz can be smaller than AES_BLOCK_SIZE */ - XMEMCPY(authTag, scratch, authTagSz); + if (authTagSz > AES_BLOCK_SIZE) { + XMEMCPY(authTag, scratch, AES_BLOCK_SIZE); + } + else { + /* authTagSz can be smaller than AES_BLOCK_SIZE */ + XMEMCPY(authTag, scratch, authTagSz); + } return 0; } @@ -2244,11 +2510,6 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* authIn, word32 authInSz) { /* sanity checks */ - if (authTagSz > AES_BLOCK_SIZE) { - WOLFSSL_MSG("parameter authTagSz can not be larger than 16 bytes"); - return BAD_FUNC_ARG; /* is bigger then scratch buffer */ - } - if (aes == NULL || (iv == NULL && ivSz > 0) || (authTag == NULL && authTagSz > 0) || (authIn == NULL && authInSz > 0) || @@ -2309,11 +2570,6 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, ctr = counter ; /* sanity checks */ - if (authTagSz > AES_BLOCK_SIZE) { - WOLFSSL_MSG("parameter authTagSz can not be larger than 16 bytes"); - return BAD_FUNC_ARG; /* is bigger then scratch buffer */ - } - if (aes == NULL || (iv == 
NULL && ivSz > 0) || (authTag == NULL && authTagSz > 0) || (authIn == NULL && authInSz > 0) || @@ -2780,10 +3036,10 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VLD1.32 {q7}, [%[Key]]! \n" "VLD1.32 {q8}, [%[Key]]! \n" "VLD1.32 {q9}, [%[Key]]! \n" - "VLD1.32 {q10}, [%[Key]]! \n" - "VLD1.32 {q11}, [%[Key]]! \n" - "VLD1.32 {q0}, [%[reg]] \n" - "VLD1.32 {q12}, [%[input]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q0}, [%[reg]] \n" + "VLD1.32 {q12}, [%[input]]!\n" "AESCBC128Block:\n" "#CBC operations, xorbuf in with current aes->reg \n" @@ -2840,10 +3096,10 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VLD1.32 {q7}, [%[Key]]! \n" "VLD1.32 {q8}, [%[Key]]! \n" "VLD1.32 {q9}, [%[Key]]! \n" - "VLD1.32 {q10}, [%[Key]]! \n" - "VLD1.32 {q11}, [%[Key]]! \n" - "VLD1.32 {q0}, [%[reg]] \n" - "VLD1.32 {q12}, [%[input]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q0}, [%[reg]] \n" + "VLD1.32 {q12}, [%[input]]!\n" "VLD1.32 {q13}, [%[Key]]! \n" "VLD1.32 {q14}, [%[Key]]! \n" @@ -2906,10 +3162,10 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VLD1.32 {q7}, [%[Key]]! \n" "VLD1.32 {q8}, [%[Key]]! \n" "VLD1.32 {q9}, [%[Key]]! \n" - "VLD1.32 {q10}, [%[Key]]! \n" - "VLD1.32 {q11}, [%[Key]]! \n" - "VLD1.32 {q0}, [%[reg]] \n" - "VLD1.32 {q12}, [%[input]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q0}, [%[reg]] \n" + "VLD1.32 {q12}, [%[input]]!\n" "VLD1.32 {q13}, [%[Key]]! \n" "VLD1.32 {q14}, [%[Key]]! \n" @@ -3002,10 +3258,10 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VLD1.32 {q7}, [%[Key]]! \n" "VLD1.32 {q8}, [%[Key]]! \n" "VLD1.32 {q9}, [%[Key]]! \n" - "VLD1.32 {q10}, [%[Key]]! \n" - "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" "VLD1.32 {q13}, [%[reg]] \n" - "VLD1.32 {q0}, [%[input]]! 
\n" + "VLD1.32 {q0}, [%[input]]!\n" "AESCBC128BlockDec:\n" "VMOV.32 q12, q0 \n" @@ -3064,12 +3320,12 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VLD1.32 {q7}, [%[Key]]! \n" "VLD1.32 {q8}, [%[Key]]! \n" "VLD1.32 {q9}, [%[Key]]! \n" - "VLD1.32 {q10}, [%[Key]]! \n" - "VLD1.32 {q11}, [%[Key]]! \n" - "VLD1.32 {q12}, [%[Key]]! \n" - "VLD1.32 {q13}, [%[Key]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q12}, [%[Key]]! \n" + "VLD1.32 {q13}, [%[Key]]! \n" "VLD1.32 {q14}, [%[reg]] \n" - "VLD1.32 {q0}, [%[input]]! \n" + "VLD1.32 {q0}, [%[input]]!\n" "AESCBC192BlockDec: \n" "VMOV.32 q15, q0 \n" @@ -3099,8 +3355,8 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VEOR.32 q0, q0, q13\n" "VEOR.32 q0, q0, q14\n" - "SUB r11, r11, #1 \n" - "VST1.32 {q0}, [%[out]]! \n" + "SUB r11, r11, #1 \n" + "VST1.32 {q0}, [%[out]]! \n" "VMOV.32 q14, q15 \n" "CMP r11, #0 \n" @@ -3132,13 +3388,13 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VLD1.32 {q7}, [%[Key]]! \n" "VLD1.32 {q8}, [%[Key]]! \n" "VLD1.32 {q9}, [%[Key]]! \n" - "VLD1.32 {q10}, [%[Key]]! \n" - "VLD1.32 {q11}, [%[Key]]! \n" - "VLD1.32 {q12}, [%[Key]]! \n" + "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q11}, [%[Key]]! \n" + "VLD1.32 {q12}, [%[Key]]! \n" "VLD1.32 {q14}, [%[reg]] \n" - "VLD1.32 {q0}, [%[input]]! \n" + "VLD1.32 {q0}, [%[input]]!\n" - "AESCBC256BlockDec: \n" + "AESCBC256BlockDec:\n" "VMOV.32 q15, q0 \n" "AESD.8 q0, q1\n" "AESIMC.8 q0, q0\n" @@ -3246,27 +3502,93 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, case 10: /* AES 128 BLOCK */ __asm__ __volatile__ ( "MOV r11, %r[blocks] \n" - "VLD1.32 {q1}, [%[Key]]! \n" - "VLD1.32 {q2}, [%[Key]]! \n" - "VLD1.32 {q3}, [%[Key]]! \n" - "VLD1.32 {q4}, [%[Key]]! \n" + "VLDM %[Key]!, {q1-q4} \n" "#Create vector with the value 1 \n" "VMOV.u32 q15, #1 \n" "VSHR.u64 q15, q15, #32 \n" - "VLD1.32 {q5}, [%[Key]]! \n" - "VLD1.32 {q6}, [%[Key]]! 
\n" - "VLD1.32 {q7}, [%[Key]]! \n" - "VLD1.32 {q8}, [%[Key]]! \n" + "VLDM %[Key]!, {q5-q8} \n" "VEOR.32 q14, q14, q14 \n" + "VLDM %[Key]!, {q9-q11} \n" "VEXT.8 q14, q15, q14, #8\n" - "VLD1.32 {q9}, [%[Key]]! \n" - "VLD1.32 {q10}, [%[Key]]! \n" - "VLD1.32 {q11}, [%[Key]]! \n" "VLD1.32 {q13}, [%[reg]]\n" - "VLD1.32 {q12}, [%[input]]! \n" + /* double block */ + "AESCTR128Block2: \n" + "CMP r11, #1 \n" + "BEQ AESCTR128Block \n" + "CMP r11, #0 \n" + "BEQ AESCTRend \n" + + "VMOV.32 q0, q13 \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13 \n" /* network order */ + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "SUB r11, r11, #2 \n" + "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */ + "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */ + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q15, q15, q15, #8 \n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q15, q15\n" /* revert from network order */ + "VREV64.8 q13, q13\n" /* revert from network order */ + "AESE.8 q0, q5\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q1\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q6\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q2\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q7\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q3\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q8\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q4\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q9\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q5\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q10\n" + "AESE.8 q15, q6\n" + "AESMC.8 q15, q15\n" + "VEOR.32 q0, q0, q11\n" + + "AESE.8 q15, q7\n" + "AESMC.8 q15, q15\n" + "VLD1.32 {q12}, [%[input]]! \n" + "AESE.8 q15, q8\n" + "AESMC.8 q15, q15\n" + + "VEOR.32 q0, q0, q12\n" + "AESE.8 q15, q9\n" + "AESMC.8 q15, q15\n" + + "VLD1.32 {q12}, [%[input]]! \n" + "AESE.8 q15, q10\n" + "VST1.32 {q0}, [%[out]]! \n" + "VEOR.32 q15, q15, q11\n" + "VEOR.32 q15, q15, q12\n" + "VST1.32 {q15}, [%[out]]! 
\n" + + "B AESCTR128Block2 \n" + + /* single block */ "AESCTR128Block: \n" "VMOV.32 q0, q13 \n" "AESE.8 q0, q1\n" @@ -3299,11 +3621,6 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VEOR.32 q0, q0, q12\n" "VST1.32 {q0}, [%[out]]! \n" - "CMP r11, #0 \n" - "BEQ AESCTRend \n" - "VLD1.32 {q12}, [%[input]]! \n" - "B AESCTR128Block \n" - "AESCTRend: \n" "#store current counter qalue at the end \n" "VST1.32 {q13}, [%[regOut]] \n" @@ -3313,7 +3630,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, :"0" (out), [Key] "1" (keyPt), [input] "3" (in), [blocks] "r" (numBlocks), [reg] "2" (regPt) : "cc", "memory", "r11", "q0", "q1", "q2", "q3", "q4", "q5", - "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14" + "q6", "q7", "q8", "q9", "q10","q11","q12","q13","q14", "q15" ); break; @@ -3336,11 +3653,102 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VEXT.8 q14, q15, q14, #8\n" "VLD1.32 {q9}, [%[Key]]! \n" - "VLD1.32 {q10}, [%[Key]]! \n" + "VLD1.32 {q10}, [%[Key]]!\n" "VLD1.32 {q13}, [%[reg]]\n" - "VLD1.32 {q12}, [%[input]]! 
\n" + /* double block */ + "AESCTR192Block2: \n" + "CMP r11, #1 \n" + "BEQ AESCTR192Block \n" + "CMP r11, #0 \n" + "BEQ AESCTR192end \n" + + "VMOV.32 q0, q13\n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13 \n" /* network order */ + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "SUB r11, r11, #2 \n" + "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */ + "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */ + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q15, q15, q15, #8 \n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q15, q15\n" /* revert from network order */ + "VREV64.8 q13, q13\n" /* revert from network order */ + "AESE.8 q0, q5\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q1\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q6\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q2\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q7\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q3\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q8\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q4\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q9\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q5\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q10\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q11}, [%[Key]]! \n" + "AESE.8 q15, q6\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q11\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q7\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q8\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q9\n" + "AESMC.8 q15, q15\n" + "VLD1.32 {q12}, [%[input]]! \n" + "AESE.8 q15, q10\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q11\n" + "AESMC.8 q15, q15\n" + "VLD1.32 {q11}, [%[Key]]! \n" + "AESE.8 q0, q11\n" + "AESE.8 q15, q11\n" + + "VLD1.32 {q11}, [%[Key]] \n" + "VEOR.32 q0, q0, q11\n" + "VEOR.32 q15, q15, q11\n" + "VEOR.32 q0, q0, q12\n" + + "VLD1.32 {q12}, [%[input]]! \n" + "VST1.32 {q0}, [%[out]]! \n" + "VEOR.32 q15, q15, q12\n" + "VST1.32 {q15}, [%[out]]! 
\n" + "SUB %[Key], %[Key], #32 \n" + + "B AESCTR192Block2 \n" + + + /* single block */ "AESCTR192Block: \n" + "VLD1.32 {q12}, [%[input]]! \n" "VLD1.32 {q11}, [%[Key]]! \n" "VMOV.32 q0, q13 \n" "AESE.8 q0, q1\n" @@ -3375,16 +3783,10 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE.8 q0, q11\n" "VLD1.32 {q11}, [%[Key]] \n" "VEOR.32 q0, q0, q11\n" - "SUB %[Key], %[Key], #32 \n" "#CTR operations, increment counter and xorbuf \n" "VEOR.32 q0, q0, q12\n" "VST1.32 {q0}, [%[out]]! \n" - "CMP r11, #0 \n" - "BEQ AESCTR192end \n" - "VLD1.32 {q12}, [%[input]]! \n" - "B AESCTR192Block \n" - "AESCTR192end: \n" "#store current counter qalue at the end \n" "VST1.32 {q13}, [%[regOut]] \n" @@ -3420,8 +3822,111 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VLD1.32 {q10}, [%[Key]]! \n" "VLD1.32 {q13}, [%[reg]]\n" - "VLD1.32 {q12}, [%[input]]! \n" + /* double block */ + "AESCTR256Block2: \n" + "CMP r11, #1 \n" + "BEQ AESCTR256Block \n" + "CMP r11, #0 \n" + "BEQ AESCTR256end \n" + + "VMOV.32 q0, q13 \n" + "AESE.8 q0, q1\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q13, q13 \n" /* network order */ + "AESE.8 q0, q2\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q13, q13, q13, #8 \n" + "SUB r11, r11, #2 \n" + "VADD.i32 q15, q13, q14 \n" /* add 1 to counter */ + "VADD.i32 q13, q15, q14 \n" /* add 1 to counter */ + "AESE.8 q0, q3\n" + "AESMC.8 q0, q0\n" + "VEXT.8 q15, q15, q15, #8 \n" + "VEXT.8 q13, q13, q13, #8 \n" + "AESE.8 q0, q4\n" + "AESMC.8 q0, q0\n" + "VREV64.8 q15, q15\n" /* revert from network order */ + "VREV64.8 q13, q13\n" /* revert from network order */ + "AESE.8 q0, q5\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q1\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q6\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q2\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q7\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q3\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q8\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q4\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q9\n" + "AESMC.8 q0, q0\n" + "AESE.8 
q15, q5\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q10\n" + "AESMC.8 q0, q0\n" + "VLD1.32 {q11}, [%[Key]]! \n" + "AESE.8 q15, q6\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q0, q11\n" + "AESMC.8 q0, q0\n" + "AESE.8 q15, q7\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q8\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q9\n" + "AESMC.8 q15, q15\n" + "VLD1.32 {q12}, [%[input]]! \n" + "AESE.8 q15, q10\n" + "AESMC.8 q15, q15\n" + + "AESE.8 q15, q11\n" + "AESMC.8 q15, q15\n" + + "VLD1.32 {q11}, [%[Key]]! \n" + "AESE.8 q0, q11\n" /* rnd 12*/ + "AESMC.8 q0, q0\n" + "AESE.8 q15, q11\n" /* rnd 12 */ + "AESMC.8 q15, q15\n" + + "VLD1.32 {q11}, [%[Key]]! \n" + "AESE.8 q0, q11\n" /* rnd 13 */ + "AESMC.8 q0, q0\n" + "AESE.8 q15, q11\n" /* rnd 13 */ + "AESMC.8 q15, q15\n" + + "VLD1.32 {q11}, [%[Key]]! \n" + "AESE.8 q0, q11\n" /* rnd 14 */ + "AESE.8 q15, q11\n" /* rnd 14 */ + + "VLD1.32 {q11}, [%[Key]] \n" + "VEOR.32 q0, q0, q11\n" /* rnd 15 */ + "VEOR.32 q15, q15, q11\n" /* rnd 15 */ + "VEOR.32 q0, q0, q12\n" + + "VLD1.32 {q12}, [%[input]]! \n" + "VST1.32 {q0}, [%[out]]! \n" + "VEOR.32 q15, q15, q12\n" + "VST1.32 {q15}, [%[out]]! \n" + "SUB %[Key], %[Key], #64 \n" + + /* single block */ + "B AESCTR256Block2 \n" + "AESCTR256Block: \n" + "VLD1.32 {q12}, [%[input]]! \n" "VLD1.32 {q11}, [%[Key]]! \n" "VMOV.32 q0, q13 \n" "AESE.8 q0, q1\n" @@ -3435,7 +3940,6 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "VADD.i32 q13, q13, q14 \n" /* add 1 to counter */ "AESE.8 q0, q4\n" "AESMC.8 q0, q0\n" - "SUB r11, r11, #1 \n" "AESE.8 q0, q5\n" "AESMC.8 q0, q0\n" "VEXT.8 q13, q13, q13, #8 \n" @@ -3453,25 +3957,19 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE.8 q0, q11\n" "AESMC.8 q0, q0\n" "VLD1.32 {q11}, [%[Key]]! \n" - "AESE.8 q0, q11\n" + "AESE.8 q0, q11\n" /* rnd 12 */ "AESMC.8 q0, q0\n" "VLD1.32 {q11}, [%[Key]]! \n" - "AESE.8 q0, q11\n" + "AESE.8 q0, q11\n" /* rnd 13 */ "AESMC.8 q0, q0\n" "VLD1.32 {q11}, [%[Key]]! 
\n" - "AESE.8 q0, q11\n" + "AESE.8 q0, q11\n" /* rnd 14 */ "VLD1.32 {q11}, [%[Key]] \n" - "VEOR.32 q0, q0, q11\n" - "SUB %[Key], %[Key], #64 \n" + "VEOR.32 q0, q0, q11\n" /* rnd 15 */ "#CTR operations, increment counter and xorbuf \n" "VEOR.32 q0, q0, q12\n" "VST1.32 {q0}, [%[out]]! \n" - "CMP r11, #0 \n" - "BEQ AESCTR256end \n" - "VLD1.32 {q12}, [%[input]]! \n" - "B AESCTR256Block \n" - "AESCTR256end: \n" "#store current counter qalue at the end \n" "VST1.32 {q13}, [%[regOut]] \n" @@ -3511,48 +4009,66 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, #endif /* WOLFSSL_AES_COUNTER */ #ifdef HAVE_AESGCM - /* - * from GCM implementation in wolfcrypt/src/aes.c + * Uses Karatsuba algorithm. Reduction algorithm is based on "Implementing GCM + * on ARMv8". Shifting left to account for bit reflection is based on + * "Carry-Less Multiplication and Its Usage for Computing the GCM mode" */ - -static INLINE void RIGHTSHIFTX(byte* x) -{ - int i; - int carryOut = 0; - int carryIn = 0; - int borrow = x[15] & 0x01; - - for (i = 0; i < AES_BLOCK_SIZE; i++) { - carryOut = x[i] & 0x01; - x[i] = (x[i] >> 1) | (carryIn ? 0x80 : 0); - carryIn = carryOut; - } - if (borrow) x[0] ^= 0xE1; -} - static void GMULT(byte* X, byte* Y) { - byte Z[AES_BLOCK_SIZE]; - byte V[AES_BLOCK_SIZE]; - int i, j; + __asm__ __volatile__ ( + "VLD1.32 {q0}, [%[x]] \n" - XMEMSET(Z, 0, AES_BLOCK_SIZE); - XMEMCPY(V, X, AES_BLOCK_SIZE); - for (i = 0; i < AES_BLOCK_SIZE; i++) - { - byte y = Y[i]; - for (j = 0; j < 8; j++) - { - if (y & 0x80) { - xorbuf(Z, V, AES_BLOCK_SIZE); - } + /* In GCM format bits are big endian, switch location of bytes to + * allow for logical shifts and carries. 
+ */ + "VREV64.8 q0, q0 \n" + "VLD1.32 {q1}, [%[y]] \n" /* converted on set key */ + "VSWP.8 d0, d1 \n" - RIGHTSHIFTX(V); - y = y << 1; - } - } - XMEMCPY(X, Z, AES_BLOCK_SIZE); + "VMULL.p64 q5, d0, d2 \n" + "VMULL.p64 q6, d1, d3 \n" + "VEOR d15, d2, d3 \n" + "VEOR d14, d0, d1 \n" + "VMULL.p64 q7, d15, d14 \n" + "VEOR q7, q5 \n" + "VEOR q7, q6 \n" + "VEOR d11, d14 \n" + "VEOR d12, d15\n" + + /* shift to left by 1 to account for reflection */ + "VMOV q7, q6 \n" + "VSHL.u64 q6, q6, #1 \n" + "VSHR.u64 q7, q7, #63 \n" + "VEOR d13, d14 \n" + "VMOV q8, q5 \n" + "VSHL.u64 q5, q5, #1 \n" + "VSHR.u64 q8, q8, #63 \n" + "VEOR d12, d17 \n" + "VEOR d11, d16 \n" + + /* create constant 0xc200000000000000 */ + "VMOV.i32 d16, 0xc2000000 \n" + "VSHL.u64 d16, d16, #32 \n" + + /* reduce product of multiplication */ + "VMULL.p64 q9, d10, d16 \n" + "VEOR d11, d18 \n" + "VEOR d12, d19 \n" + "VMULL.p64 q9, d11, d16 \n" + "VEOR q6, q9 \n" + "VEOR q10, q5, q6 \n" + + /* convert to GCM format */ + "VREV64.8 q10, q10 \n" + "VSWP.8 d20, d21 \n" + + "VST1.32 {q10}, [%[xOut]] \n" + + : [xOut] "=r" (X), [yOut] "=r" (Y) + : [x] "0" (X), [y] "1" (Y) + : "memory", "q0", "q1", "q5", "q6", "q7", "q8", "q9", "q10" /* written by asm */ + ); } @@ -3639,6 +4155,16 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, byte scratch[AES_BLOCK_SIZE]; ctr = counter ; + /* sanity checks */ + if (aes == NULL || (iv == NULL && ivSz > 0) || + (authTag == NULL && authTagSz > 0) || + (authIn == NULL && authInSz > 0) || + (in == NULL && sz > 0) || + (out == NULL && authTag == NULL)) { + WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); + return BAD_FUNC_ARG; + } + XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); if (ivSz == NONCE_SZ) { XMEMCPY(initialCounter, iv, ivSz); @@ -3668,7 +4194,12 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz); wc_AesEncrypt(aes, initialCounter, scratch); - xorbuf(authTag, scratch, authTagSz); + if (authTagSz > AES_BLOCK_SIZE) { + xorbuf(authTag, scratch, 
AES_BLOCK_SIZE); + } + else { + xorbuf(authTag, scratch, authTagSz); + } return 0; } @@ -3999,7 +4530,7 @@ int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, #endif /* HAVE_AESCCM */ #ifdef WOLFSSL_ASYNC_CRYPT - + /* Initialize Aes for use with Nitrox device */ int wc_AesAsyncInit(Aes* aes, int devId) { @@ -4059,6 +4590,19 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) : "cc", "memory" ); } + #else + { + word32* pt = (word32*)aes->H; + __asm__ volatile ( + "VLD1.32 {q0}, [%[h]] \n" + "VREV64.8 q0, q0 \n" + "VSWP.8 d0, d1 \n" + "VST1.32 {q0}, [%[out]] \n" + : [out] "=r" (pt) + : [h] "0" (pt) + : "cc", "memory", "q0" /* q0 is scratch */ + ); + } #endif } diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c index ef9591bd0..6e634d5be 100644 --- a/wolfcrypt/src/port/arm/armv8-sha256.c +++ b/wolfcrypt/src/port/arm/armv8-sha256.c @@ -79,7 +79,7 @@ static const ALIGN32 word32 K[64] = { int wc_InitSha256(Sha256* sha256) { int ret = 0; - + sha256->digest[0] = 0x6A09E667L; sha256->digest[1] = 0xBB67AE85L; sha256->digest[2] = 0x3C6EF372L; @@ -92,7 +92,7 @@ int wc_InitSha256(Sha256* sha256) sha256->buffLen = 0; sha256->loLen = 0; sha256->hiLen = 0; - + return ret; } @@ -151,135 +151,133 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) /* begining of SHA256 block operation */ "sha256Start:\n" + /* Round 1 */ "MOV v4.16b, v0.16b \n" "ADD v0.4s, v0.4s, v16.4s \n" "MOV v11.16b, v12.16b \n" "SHA256H q12, q13, v0.4s \n" "SHA256H2 q13, q11, v0.4s \n" + /* Round 2 */ + "SHA256SU0 v4.4s, v1.4s \n" "ADD v0.4s, v1.4s, v17.4s \n" "MOV v11.16b, v12.16b \n" + "SHA256SU1 v4.4s, v2.4s, v3.4s \n" "SHA256H q12, q13, v0.4s \n" "SHA256H2 q13, q11, v0.4s \n" + /* Round 3 */ + "SHA256SU0 v1.4s, v2.4s \n" "ADD v0.4s, v2.4s, v18.4s \n" "MOV v11.16b, v12.16b \n" + "SHA256SU1 v1.4s, v3.4s, v4.4s \n" "SHA256H q12, q13, v0.4s \n" "SHA256H2 q13, q11, v0.4s \n" - "SHA256SU0 v4.4s, v1.4s \n" + /* Round 4 */ + "SHA256SU0 v2.4s, v3.4s \n" "ADD 
v0.4s, v3.4s, v19.4s \n" "MOV v11.16b, v12.16b \n" - "MOV v5.16b, v1.16b \n" - "SHA256SU1 v4.4s, v2.4s, v3.4s \n" + "SHA256SU1 v2.4s, v4.4s, v1.4s \n" "SHA256H q12, q13, v0.4s \n" "SHA256H2 q13, q11, v0.4s \n" - "SHA256SU0 v5.4s, v2.4s \n" + /* Round 5 */ + "SHA256SU0 v3.4s, v4.4s \n" "ADD v0.4s, v4.4s, v20.4s \n" "MOV v11.16b, v12.16b \n" - "MOV v6.16b, v2.16b \n" - "SHA256SU1 v5.4s, v3.4s, v4.4s \n" + "SHA256SU1 v3.4s, v1.4s, v2.4s \n" "SHA256H q12, q13, v0.4s \n" "SHA256H2 q13, q11, v0.4s \n" - "SHA256SU0 v6.4s, v3.4s \n" - "ADD v0.4s, v5.4s, v21.4s \n" + /* Round 6 */ + "SHA256SU0 v4.4s, v1.4s \n" + "ADD v0.4s, v1.4s, v21.4s \n" "MOV v11.16b, v12.16b \n" - "MOV v7.16b, v3.16b \n" - "SHA256SU1 v6.4s, v4.4s, v5.4s \n" - "SHA256H q12, q13, v0.4s \n" - "SHA256H2 q13, q11, v0.4s \n" - - "SHA256SU0 v7.4s, v4.4s \n" - "ADD v0.4s, v6.4s, v22.4s \n" - "MOV v11.16b, v12.16b \n" - "MOV v8.16b, v4.16b \n" - "SHA256SU1 v7.4s, v5.4s, v6.4s \n" - "SHA256H q12, q13, v0.4s \n" - "SHA256H2 q13, q11, v0.4s \n" - - "SHA256SU0 v8.4s, v5.4s \n" - "ADD v0.4s, v7.4s, v23.4s \n" - "MOV v11.16b, v12.16b \n" - "MOV v9.16b, v5.16b \n" - "SHA256SU1 v8.4s, v6.4s, v7.4s \n" - "SHA256H q12, q13, v0.4s \n" - "SHA256H2 q13, q11, v0.4s \n" - - "SHA256SU0 v9.4s, v6.4s \n" - "ADD v0.4s, v8.4s, v24.4s \n" - "MOV v11.16b, v12.16b \n" - "MOV v10.16b, v6.16b \n" - "SHA256SU1 v9.4s, v7.4s, v8.4s \n" - "SHA256H q12, q13, v0.4s \n" - "SHA256H2 q13, q11, v0.4s \n" - - "SHA256SU0 v10.4s, v7.4s \n" - "ADD v0.4s, v9.4s, v25.4s \n" - "MOV v11.16b, v12.16b \n" - "SHA256SU1 v10.4s, v8.4s, v9.4s \n" - "SHA256H q12, q13, v0.4s \n" - "SHA256H2 q13, q11, v0.4s \n" - - "ADD v0.4s, v10.4s, v26.4s \n" - "MOV v11.16b, v12.16b \n" - "SHA256H q12, q13, v0.4s \n" - "SHA256H2 q13, q11, v0.4s \n" - - /* Re-use of registers is needed in order to not overwrite - * previous digest value. 
*/ - "#move to lower register and handle last rounds 11-15 \n" - "MOV v4.16b, v7.16b \n" - "MOV v1.16b, v8.16b \n" - "MOV v2.16b, v9.16b \n" - "MOV v3.16b, v10.16b \n" - "MOV v5.16b, v8.16b \n" - - "SHA256SU0 v4.4s, v1.4s \n" /* 4 -> 11 */ - "SHA256SU0 v5.4s, v2.4s \n" "SHA256SU1 v4.4s, v2.4s, v3.4s \n" - "SHA256SU1 v5.4s, v3.4s, v4.4s \n" - "ADD v0.4s, v4.4s, v27.4s \n" - "MOV v11.16b, v12.16b \n" - "MOV v6.16b, v2.16b \n" "SHA256H q12, q13, v0.4s \n" "SHA256H2 q13, q11, v0.4s \n" - "SHA256SU0 v6.4s, v3.4s \n" - "ADD v0.4s, v5.4s, v28.4s \n" + /* Round 7 */ + "SHA256SU0 v1.4s, v2.4s \n" + "ADD v0.4s, v2.4s, v22.4s \n" "MOV v11.16b, v12.16b \n" - "MOV v7.16b, v3.16b \n" - "SHA256SU1 v6.4s, v4.4s, v5.4s \n" + "SHA256SU1 v1.4s, v3.4s, v4.4s \n" "SHA256H q12, q13, v0.4s \n" "SHA256H2 q13, q11, v0.4s \n" - "SHA256SU0 v7.4s, v4.4s \n" - "ADD v0.4s, v6.4s, v29.4s \n" + /* Round 8 */ + "SHA256SU0 v2.4s, v3.4s \n" + "ADD v0.4s, v3.4s, v23.4s \n" "MOV v11.16b, v12.16b \n" - "MOV v8.16b, v4.16b \n" - "SHA256SU1 v7.4s, v5.4s, v6.4s \n" + "SHA256SU1 v2.4s, v4.4s, v1.4s \n" "SHA256H q12, q13, v0.4s \n" "SHA256H2 q13, q11, v0.4s \n" - "SHA256SU0 v8.4s, v5.4s \n" - "ADD v0.4s, v7.4s, v30.4s \n" + /* Round 9 */ + "SHA256SU0 v3.4s, v4.4s \n" + "ADD v0.4s, v4.4s, v24.4s \n" "MOV v11.16b, v12.16b \n" - "SHA256SU1 v8.4s, v6.4s, v7.4s \n" + "SHA256SU1 v3.4s, v1.4s, v2.4s \n" "SHA256H q12, q13, v0.4s \n" "SHA256H2 q13, q11, v0.4s \n" - "ADD v0.4s, v8.4s, v31.4s \n" + /* Round 10 */ + "SHA256SU0 v4.4s, v1.4s \n" + "ADD v0.4s, v1.4s, v25.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v4.4s, v2.4s, v3.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 11 */ + "SHA256SU0 v1.4s, v2.4s \n" + "ADD v0.4s, v2.4s, v26.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v1.4s, v3.4s, v4.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 12 */ + "SHA256SU0 v2.4s, v3.4s \n" + "ADD v0.4s, v3.4s, v27.4s \n" + "MOV v11.16b, v12.16b \n" + 
"SHA256SU1 v2.4s, v4.4s, v1.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 13 */ + "SHA256SU0 v3.4s, v4.4s \n" + "ADD v0.4s, v4.4s, v28.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256SU1 v3.4s, v1.4s, v2.4s \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 14 */ + "ADD v0.4s, v1.4s, v29.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 15 */ + "ADD v0.4s, v2.4s, v30.4s \n" + "MOV v11.16b, v12.16b \n" + "SHA256H q12, q13, v0.4s \n" + "SHA256H2 q13, q11, v0.4s \n" + + /* Round 16 */ + "ADD v0.4s, v3.4s, v31.4s \n" "MOV v11.16b, v12.16b \n" "SHA256H q12, q13, v0.4s \n" "SHA256H2 q13, q11, v0.4s \n" "#Add working vars back into digest state \n" + "SUB w8, w8, #1 \n" "ADD v12.4s, v12.4s, v14.4s \n" "ADD v13.4s, v13.4s, v15.4s \n" "#check if more blocks should be done\n" - "SUB w8, w8, #1 \n" "CBZ w8, sha256End \n" "#load in message and schedual updates \n" @@ -301,7 +299,7 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) [blocks] "2" (numBlocks), [dataIn] "3" (data) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", - "v15" + "v15", "w8" ); AddLength(sha256, SHA256_BLOCK_SIZE * numBlocks); @@ -691,20 +689,17 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) add = (len + sha256->buffLen) - numBlocks * SHA256_BLOCK_SIZE; __asm__ volatile ( "#load leftover data\n" - "VLD1.32 {q0}, [%[buffer]]! \n" - "VLD1.32 {q1}, [%[buffer]]! \n" - "VLD1.32 {q2}, [%[buffer]]! \n" - "VLD1.32 {q3}, [%[buffer]] \n" + "VLDM %[buffer]!, {q0-q3} \n" "#load current digest\n" - "VLD1.32 {q12}, [%[digest]]! \n" - "VLD1.32 {q13}, [%[digest]] \n" - "SUB %[digest], %[digest], #16 \n" + "VLDM %[digest], {q12-q13} \n" "MOV r8, %r[blocks] \n" "VREV32.8 q0, q0 \n" "VREV32.8 q1, q1 \n" "VREV32.8 q2, q2 \n" "VREV32.8 q3, q3 \n" + "VLDM %[k]! ,{q5-q8} \n" + "VLDM %[k]! 
,{q9}\n" "VMOV.32 q14, q12 \n" /* store digest for add at the end */ "VMOV.32 q15, q13 \n" @@ -713,151 +708,135 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) "sha256Start:\n" /* Round 1 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VMOV.32 q4, q0 \n" - "VADD.i32 q0, q0, q5 \n" - "VMOV.32 q11, q12 \n" + "VMOV.32 q4, q0 \n" + "VADD.i32 q0, q0, q5 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 2 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VADD.i32 q0, q1, q5 \n" - "VMOV.32 q11, q12 \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q6 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 3 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VADD.i32 q0, q2, q5 \n" - "VMOV.32 q11, q12 \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q7 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 4 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q4, q1 \n" - "VADD.i32 q0, q3, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q6, q1 \n" - "SHA256SU1.32 q4, q2, q3 \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q8 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 5 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q6, q2 \n" - "VADD.i32 q0, q4, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q7, q2 \n" - "SHA256SU1.32 q6, q3, q4 \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q9 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 6 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q7, q3 \n" - "VADD.i32 q0, q6, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q8, q3 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q7, q4, q6 \n" + "VLD1.32 {q10}, [%[k]]! 
\n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 7 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q8, q4 \n" - "VADD.i32 q0, q7, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV q9, q4 \n" - "SHA256SU1.32 q8, q6, q7 \n" + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 8 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q9, q6 \n" - "VADD.i32 q0, q8, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q10, q6 \n" - "SHA256SU1.32 q9, q7, q8 \n" + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 9 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q10, q7 \n" - "VMOV.32 q1, q7 \n" - "VADD.i32 q0, q9, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q10, q8, q9 \n" + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 10 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q1, q8 \n" - "VMOV.32 q2, q8 \n" - "VADD.i32 q0, q10, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q1, q9, q10 \n" + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 11 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q2, q9 \n" - "VMOV.32 q3, q9 \n" - "VADD.i32 q0, q1, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q2, q10, q1 \n" + "VLD1.32 {q10}, [%[k]]! 
\n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 12 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q3, q10 \n" - "VMOV.32 q4, q10 \n" - "VADD.i32 q0, q2, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q3, q1, q2 \n" + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 13 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q4, q1 \n" - "VMOV.32 q6, q1 \n" - "VADD.i32 q0, q3, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q4, q2, q3 \n" + "VLD1.32 {q10}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q10 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 14 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q6, q2 \n" - "VMOV.32 q7, q2 \n" - "VADD.i32 q0, q4, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q6, q3, q4 \n" + "VLD1.32 {q10}, [%[k]]! \n" + "VADD.i32 q0, q1, q10 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 15 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q7, q3 \n" - "VADD.i32 q0, q6, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q7, q4, q6 \n" + "VLD1.32 {q10}, [%[k]]! \n" + "VADD.i32 q0, q2, q10 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 16 */ - "VLD1.32 {q5}, [%[k]]! 
\n" - "VADD.i32 q0, q7, q5 \n" - "VMOV.32 q11, q12 \n" + "VLD1.32 {q10}, [%[k]] \n" + "SUB r8, r8, #1 \n" + "VADD.i32 q0, q3, q10 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" @@ -866,7 +845,6 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) "VADD.i32 q13, q13, q15 \n" "#check if more blocks should be done\n" - "SUB r8, r8, #1 \n" "CMP r8, #0 \n" "BEQ sha256End \n" @@ -877,7 +855,7 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) "VLD1.32 {q3}, [%[dataIn]]! \n" /* reset K pointer */ - "SUB %[k], %[k], #256 \n" + "SUB %[k], %[k], #160 \n" "VREV32.8 q0, q0 \n" "VREV32.8 q1, q1 \n" "VREV32.8 q2, q2 \n" @@ -895,7 +873,7 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) [blocks] "2" (numBlocks), [dataIn] "3" (data) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", - "q15" + "q15", "r8" ); AddLength(sha256, SHA256_BLOCK_SIZE * numBlocks); @@ -930,15 +908,10 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen; __asm__ volatile ( "#load leftover data\n" - "VLD1.32 {q0}, [%[buffer]]! \n" - "VLD1.32 {q1}, [%[buffer]]! \n" - "VLD1.32 {q2}, [%[buffer]]! \n" - "VLD1.32 {q3}, [%[buffer]] \n" + "VLDM %[buffer]!, {q0-q3} \n" "#load current digest\n" - "VLD1.32 {q12}, [%[digest]]! \n" - "VLD1.32 {q13}, [%[digest]] \n" - "SUB %[digest], %[digest], #16 \n" + "VLDM %[digest], {q12-q13} \n" "VREV32.8 q0, q0 \n" "VREV32.8 q1, q1 \n" "VREV32.8 q2, q2 \n" @@ -949,153 +922,140 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) "VMOV.32 q15, q13 \n" /* begining of SHA256 block operation */ - /* Round 1 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VMOV.32 q4, q0 \n" - "VADD.i32 q0, q0, q5 \n" - "VMOV.32 q11, q12 \n" + "VLD1.32 {q5}, [%[k]]! 
\n" + "VMOV.32 q4, q0 \n" + "VADD.i32 q0, q0, q5 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 2 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VADD.i32 q0, q1, q5 \n" - "VMOV.32 q11, q12 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 3 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VADD.i32 q0, q2, q5 \n" - "VMOV.32 q11, q12 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 4 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q4, q1 \n" - "VADD.i32 q0, q3, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q6, q1 \n" - "SHA256SU1.32 q4, q2, q3 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 5 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q6, q2 \n" - "VADD.i32 q0, q4, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q7, q2 \n" - "SHA256SU1.32 q6, q3, q4 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 6 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q7, q3 \n" - "VADD.i32 q0, q6, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q8, q3 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q7, q4, q6 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 7 */ - "VLD1.32 {q5}, [%[k]]! 
\n" - "SHA256SU0.32 q8, q4 \n" - "VADD.i32 q0, q7, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV q9, q4 \n" - "SHA256SU1.32 q8, q6, q7 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 8 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q9, q6 \n" - "VADD.i32 q0, q8, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q10, q6 \n" - "SHA256SU1.32 q9, q7, q8 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 9 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q10, q7 \n" - "VMOV.32 q1, q7 \n" - "VADD.i32 q0, q9, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q10, q8, q9 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 10 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q1, q8 \n" - "VMOV.32 q2, q8 \n" - "VADD.i32 q0, q10, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q1, q9, q10 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 11 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q2, q9 \n" - "VMOV.32 q3, q9 \n" - "VADD.i32 q0, q1, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q2, q10, q1 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 12 */ - "VLD1.32 {q5}, [%[k]]! 
\n" - "SHA256SU0.32 q3, q10 \n" - "VMOV.32 q4, q10 \n" - "VADD.i32 q0, q2, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q3, q1, q2 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 13 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q4, q1 \n" - "VMOV.32 q6, q1 \n" - "VADD.i32 q0, q3, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q4, q2, q3 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 14 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q6, q2 \n" - "VMOV.32 q7, q2 \n" - "VADD.i32 q0, q4, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q6, q3, q4 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 15 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q7, q3 \n" - "VADD.i32 q0, q6, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q7, q4, q6 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 16 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VADD.i32 q0, q7, q5 \n" - "VMOV.32 q11, q12 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" @@ -1153,167 +1113,149 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) word32* digPt = sha256->digest; __asm__ volatile ( "#load leftover data\n" - "VLD1.32 {q0}, [%[buffer]]! \n" - "VLD1.32 {q1}, [%[buffer]]! \n" - "VLD1.32 {q2}, [%[buffer]]! \n" - "VLD1.32 {q3}, [%[buffer]] \n" + "VLDM %[buffer]!, {q0-q3} \n" "#load current digest\n" - "VLD1.32 {q12}, [%[digest]]! 
\n" - "VLD1.32 {q13}, [%[digest]] \n" - "SUB %[digest], %[digest], #16 \n" + "VLDM %[digest], {q12-q13} \n" "VMOV.32 q14, q12 \n" /* store digest for add at the end */ "VMOV.32 q15, q13 \n" /* begining of SHA256 block operation */ - /* Round 1 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VMOV.32 q4, q0 \n" - "VADD.i32 q0, q0, q5 \n" - "VMOV.32 q11, q12 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "VMOV.32 q4, q0 \n" + "VADD.i32 q0, q0, q5 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 2 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VADD.i32 q0, q1, q5 \n" - "VMOV.32 q11, q12 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 3 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VADD.i32 q0, q2, q5 \n" - "VMOV.32 q11, q12 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 4 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q4, q1 \n" - "VADD.i32 q0, q3, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q6, q1 \n" - "SHA256SU1.32 q4, q2, q3 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 5 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q6, q2 \n" - "VADD.i32 q0, q4, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q7, q2 \n" - "SHA256SU1.32 q6, q3, q4 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 6 */ - "VLD1.32 {q5}, [%[k]]! 
\n" - "SHA256SU0.32 q7, q3 \n" - "VADD.i32 q0, q6, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q8, q3 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q7, q4, q6 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 7 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q8, q4 \n" - "VADD.i32 q0, q7, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV q9, q4 \n" - "SHA256SU1.32 q8, q6, q7 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 8 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q9, q6 \n" - "VADD.i32 q0, q8, q5 \n" - "VMOV.32 q11, q12 \n" - "VMOV.32 q10, q6 \n" - "SHA256SU1.32 q9, q7, q8 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 9 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q10, q7 \n" - "VMOV.32 q1, q7 \n" - "VADD.i32 q0, q9, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q10, q8, q9 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 10 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q1, q8 \n" - "VMOV.32 q2, q8 \n" - "VADD.i32 q0, q10, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q1, q9, q10 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q4, q1 \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q4, q2, q3 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 11 */ - "VLD1.32 {q5}, [%[k]]! 
\n" - "SHA256SU0.32 q2, q9 \n" - "VMOV.32 q3, q9 \n" - "VADD.i32 q0, q1, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q2, q10, q1 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q1, q2 \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q1, q3, q4 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 12 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q3, q10 \n" - "VMOV.32 q4, q10 \n" - "VADD.i32 q0, q2, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q3, q1, q2 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q2, q3 \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q2, q4, q1 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 13 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q4, q1 \n" - "VMOV.32 q6, q1 \n" - "VADD.i32 q0, q3, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q4, q2, q3 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "SHA256SU0.32 q3, q4 \n" + "VADD.i32 q0, q4, q5 \n" + "VMOV.32 q11, q12 \n" + "SHA256SU1.32 q3, q1, q2 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 14 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q6, q2 \n" - "VMOV.32 q7, q2 \n" - "VADD.i32 q0, q4, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q6, q3, q4 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q1, q5 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 15 */ - "VLD1.32 {q5}, [%[k]]! \n" - "SHA256SU0.32 q7, q3 \n" - "VADD.i32 q0, q6, q5 \n" - "VMOV.32 q11, q12 \n" - "SHA256SU1.32 q7, q4, q6 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q2, q5 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n" /* Round 16 */ - "VLD1.32 {q5}, [%[k]]! \n" - "VADD.i32 q0, q7, q5 \n" - "VMOV.32 q11, q12 \n" + "VLD1.32 {q5}, [%[k]]! \n" + "VADD.i32 q0, q3, q5 \n" + "VMOV.32 q11, q12 \n" "SHA256H.32 q12, q13, q0 \n" "SHA256H2.32 q13, q11, q0 \n"