From 2cbc6ed673c1830d777b407c01f7f0a7360fd9c3 Mon Sep 17 00:00:00 2001 From: Jacob Barthelmeh Date: Wed, 23 Nov 2016 15:44:53 -0700 Subject: [PATCH 1/2] ARMv8 : handle aggressive optimizers --- wolfcrypt/src/port/arm/armv8-aes.c | 37 +++++++++++++++------------ wolfcrypt/src/port/arm/armv8-sha256.c | 12 ++++++--- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index c95334219..13e106159 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -527,8 +527,8 @@ int wc_InitAes_h(Aes* aes, void* h) "#store current counter value at the end \n" "ST1 {v0.2d}, %[regOut] \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg) - :"0" (out), [Key] "m" (aes->key), [input] "r" (in), + :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) + :"0" (out), [Key] "m" (aes->key), [input] "2" (in), [blocks] "r" (numBlocks), [reg] "m" (aes->reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" @@ -584,8 +584,8 @@ int wc_InitAes_h(Aes* aes, void* h) "ST1 {v0.2d}, %[regOut] \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg) - :"0" (out), [Key] "m" (aes->key), [input] "r" (in), + :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) + :"0" (out), [Key] "m" (aes->key), [input] "2" (in), [blocks] "r" (numBlocks), [reg] "m" (aes->reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" @@ -646,8 +646,8 @@ int wc_InitAes_h(Aes* aes, void* h) "ST1 {v0.2d}, %[regOut] \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg) - :"0" (out), [Key] "m" (aes->key), [input] "r" (in), + :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) + :"0" (out), [Key] "m" (aes->key), [input] "2" (in), [blocks] "r" (numBlocks), [reg] "m" (aes->reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", @@ -720,8 +720,8 @@ int wc_InitAes_h(Aes* aes, void* h) "#store current counter value at the end \n" "ST1 {v13.2d}, %[regOut] \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg) - :"0" (out), [Key] "m" (aes->key), [input] "r" (in), + :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) + :"0" (out), [Key] "m" (aes->key), [input] "2" (in), [blocks] "r" (numBlocks), [reg] "m" (aes->reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" @@ -778,8 +778,8 @@ int wc_InitAes_h(Aes* aes, void* h) "#store current counter value at the end \n" "ST1 {v15.2d}, %[regOut] \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg) - :"0" (out), [Key] "m" (aes->key), [input] "r" (in), + :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) + :"0" (out), [Key] "m" (aes->key), [input] "2" (in), [blocks] "r" (numBlocks), [reg] "m" (aes->reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" @@ -840,8 +840,8 @@ int wc_InitAes_h(Aes* aes, void* h) "#store current counter value at the end \n" "ST1 {v17.2d}, %[regOut] \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg) - :"0" (out), [Key] "m" (aes->key), [input] "r" (in), + :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) + :"0" (out), [Key] "m" (aes->key), [input] "2" (in), [blocks] "r" (numBlocks), [reg] "m" (aes->reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", @@ -2039,7 +2039,8 @@ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, ,[inX] "4" (xPt), [inY] "m" (aes->H) : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24" + ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24","v25","v26","v27","v28","v29","v30","v31" ); } @@ -2473,7 +2474,8 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, [ctr] "2" (iCtr) , [h] "m" (aes->H) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14", - "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24" + "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" ); @@ -4061,7 +4063,8 @@ static void GMULT(byte* X, byte* Y) : [xOut] "=r" (X), [yOut] "=r" (Y) : [x] "0" (X), [y] "1" (Y) - : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6" ,"q7", "q8", + "q9", "q10", "q11" ,"q12", "q13", "q14", "q15" ); } @@ -4597,7 +4600,7 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) "ST1 {v0.16b}, [%[out]] \n" : [out] "=r" (pt) : [h] "0" (pt) - : "cc", "memory" + : "cc", "memory", "v0" ); } #else @@ -4610,7 +4613,7 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) "VST1.32 {q0}, [%[out]] \n" : [out] "=r" (pt) : [h] "0" (pt) - : "cc", "memory" + : "cc", "memory", "q0" ); } #endif diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c index cceb8c865..06bc6897c 100644 --- a/wolfcrypt/src/port/arm/armv8-sha256.c +++ b/wolfcrypt/src/port/arm/armv8-sha256.c @@ -305,7 +305,9 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) [blocks] "2" (numBlocks), [dataIn] "3" (data) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "w8" + "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "w8" ); AddLength(sha256, SHA256_BLOCK_SIZE * numBlocks); @@ -488,6 +490,7 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) [buffer] "m" (sha256->buffer) : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11" , "v12", "v13", "v14", "v15", "v16", "v17", "v18" + , "v19", "v20", "v21", "v22", "v23", "v24", "v25" ); sha256->buffLen = 0; @@ -510,7 +513,7 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) "ST1 {v0.2d-v3.2d}, %[out] \n" : [out] "=m" (sha256->buffer) : [in] "m" (sha256->buffer) - : "cc", "memory" + : "cc", "memory", "v0", "v1", "v2", "v3" ); #endif /* ! length ordering dependent on digest endian type ! */ @@ -666,7 +669,8 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) "0" (hash) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17", "v18" + "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25" ); return wc_InitSha256(sha256); /* reset state */ @@ -1119,7 +1123,7 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) "VST1.32 {q3}, [%[out]] \n" : [out] "=r" (bufPt) : [in] "0" (bufPt) - : "cc", "memory" + : "cc", "memory", "q0", "q1", "q2", "q3" ); #endif /* ! length ordering dependent on digest endian type ! */ From 944e5fba03582d5ac60ce5c044a8daeb27dfe1bf Mon Sep 17 00:00:00 2001 From: Jacob Barthelmeh Date: Tue, 6 Dec 2016 21:42:15 +0000 Subject: [PATCH 2/2] ARMv8 : load pointer to AES key and counter into a register along with pointer to SHA256 K table to handle tight optimized loops on function call with -flto --- wolfcrypt/src/port/arm/armv8-aes.c | 73 ++++++++++++++------------- wolfcrypt/src/port/arm/armv8-sha256.c | 6 ++- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index 13e106159..e59bd2571 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -476,6 +476,8 @@ int wc_InitAes_h(Aes* aes, void* h) /* do as many block size ops as possible */ if (numBlocks > 0) { + word32* key = aes->key; + word32* reg = aes->reg; /* AESE exor's input with round key shift rows of exor'ed result @@ -487,10 +489,10 @@ int wc_InitAes_h(Aes* aes, void* h) case 10: /* AES 128 BLOCK */ __asm__ __volatile__ ( "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, %[Key], #64 \n" - "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" - "LD1 {v9.2d-v11.2d},%[Key], #48 \n" - "LD1 {v0.2d}, %[reg] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d},[%[Key]], #48 \n" + "LD1 {v0.2d}, [%[reg]] \n" "LD1 {v12.2d}, [%[input]], #16 \n" "1:\n" @@ -525,11 +527,11 @@ int wc_InitAes_h(Aes* aes, void* h) "2:\n" "#store current counter value at the end \n" - "ST1 {v0.2d}, %[regOut] \n" + "ST1 {v0.2d}, [%[regOut]] \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) - :"0" (out), [Key] "m" (aes->key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "m" (aes->reg) + :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) + :"0" (out), [Key] "r" (key), [input] "2" (in), + [blocks] "r" (numBlocks), [reg] "1" (reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" ); @@ -675,14 +677,17 @@ int wc_InitAes_h(Aes* aes, void* h) /* do as many block size ops as possible */ if (numBlocks > 0) { + word32* key = aes->key; + word32* reg = aes->reg; + switch(aes->rounds) { case 10: /* AES 128 BLOCK */ __asm__ __volatile__ ( "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, %[Key], #64 \n" - "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" - "LD1 {v9.2d-v11.2d},%[Key], #48 \n" - "LD1 {v13.2d}, %[reg] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d},[%[Key]], #48 \n" + "LD1 {v13.2d}, [%[reg]] \n" "1:\n" "LD1 {v0.2d}, [%[input]], #16 \n" @@ -718,11 +723,11 @@ int wc_InitAes_h(Aes* aes, void* h) "2: \n" "#store current counter value at the end \n" - "ST1 {v13.2d}, %[regOut] \n" + "ST1 {v13.2d}, [%[regOut]] \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) - :"0" (out), [Key] "m" (aes->key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "m" (aes->reg) + :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) + :"0" (out), [Key] "r" (key), [input] "2" (in), + [blocks] "r" (numBlocks), [reg] "1" (reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" ); @@ -731,11 +736,11 @@ int wc_InitAes_h(Aes* aes, void* h) case 12: /* AES 192 BLOCK */ __asm__ __volatile__ ( "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, %[Key], #64 \n" - "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" - "LD1 {v9.2d-v12.2d},%[Key], #64 \n" - "LD1 {v13.16b}, %[Key], #16 \n" - "LD1 {v15.2d}, %[reg] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v12.2d},[%[Key]], #64 \n" + "LD1 {v13.16b}, [%[Key]], #16 \n" + "LD1 {v15.2d}, [%[reg]] \n" "LD1 {v0.2d}, [%[input]], #16 \n" "1: \n" @@ -776,11 +781,11 @@ int wc_InitAes_h(Aes* aes, void* h) "2:\n" "#store current counter value at the end \n" - "ST1 {v15.2d}, %[regOut] \n" + "ST1 {v15.2d}, [%[regOut]] \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) - :"0" (out), [Key] "m" (aes->key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "m" (aes->reg) + :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) + :"0" (out), [Key] "r" (key), [input] "2" (in), + [blocks] "r" (numBlocks), [reg] "1" (reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" ); @@ -789,11 +794,11 @@ int wc_InitAes_h(Aes* aes, void* h) case 14: /* AES 256 BLOCK */ __asm__ __volatile__ ( "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, %[Key], #64 \n" - "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" - "LD1 {v9.2d-v12.2d}, %[Key], #64 \n" - "LD1 {v13.2d-v15.2d}, %[Key], #48 \n" - "LD1 {v17.2d}, %[reg] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n" + "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n" + "LD1 {v17.2d}, [%[reg]] \n" "LD1 {v0.2d}, [%[input]], #16 \n" "1: \n" @@ -838,11 +843,11 @@ int wc_InitAes_h(Aes* aes, void* h) "2:\n" "#store current counter value at the end \n" - "ST1 {v17.2d}, %[regOut] \n" + "ST1 {v17.2d}, [%[regOut]] \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) - :"0" (out), [Key] "m" (aes->key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "m" (aes->reg) + :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) + :"0" (out), [Key] "r" (key), [input] "2" (in), + [blocks] "r" (numBlocks), [reg] "1" (reg) : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", "v16", "v17" diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c index 06bc6897c..fdf2634bf 100644 --- a/wolfcrypt/src/port/arm/armv8-sha256.c +++ b/wolfcrypt/src/port/arm/armv8-sha256.c @@ -133,6 +133,8 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) numBlocks = (len + sha256->buffLen)/SHA256_BLOCK_SIZE; if (numBlocks > 0) { + word32* k = (word32*)K; + /* get leftover amount after blocks */ add = (len + sha256->buffLen) - numBlocks * SHA256_BLOCK_SIZE; __asm__ volatile ( @@ -300,8 +302,8 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) "STP q12, q13, %[out] \n" : [out] "=m" (sha256->digest), "=m" (sha256->buffer), "=r" (numBlocks), - "=r" (data) - : [k] "r" (K), [digest] "m" (sha256->digest), [buffer] "m" (sha256->buffer), + "=r" (data), "=r" (k) + : [k] "4" (k), [digest] "m" (sha256->digest), [buffer] "m" (sha256->buffer), [blocks] "2" (numBlocks), [dataIn] "3" (data) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",