diff --git a/wolfcrypt/src/random.c b/wolfcrypt/src/random.c
old mode 100644
new mode 100755
index 24206b3b8..a93d9416c
--- a/wolfcrypt/src/random.c
+++ b/wolfcrypt/src/random.c
@@ -22,7 +22,7 @@
 #ifdef HAVE_CONFIG_H
     #include <config.h>
 #endif
-
+
 #include <wolfssl/wolfcrypt/settings.h>
 
 /* on HPUX 11 you may need to install /dev/random see
@@ -570,6 +570,83 @@ int wc_RNG_HealthTest(int reseed, const byte* entropyA, word32 entropyASz,
     return 0;
 }
 
+#elif defined(HAVE_INTEL_RDGEN)
+
+#ifndef _MSC_VER
+    #define cpuid(reg, leaf, sub)\
+        __asm__ __volatile__ ("cpuid":\
+            "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
+            "a" (leaf), "c"(sub));
+
+    #define XASM_LINK(f) asm(f)
+#else
+
+    #include <intrin.h>
+    #define cpuid(a,b) __cpuid((int*)a,b)
+
+    #define XASM_LINK(f)
+
+#endif /* _MSC_VER */
+
+#define EAX 0
+#define EBX 1
+#define ECX 2
+#define EDX 3
+
+#define CPUID_AVX1 0x1
+#define CPUID_AVX2 0x2
+#define CPUID_RDRAND 0x4
+#define CPUID_RDSEED 0x8
+
+#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
+#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
+
+static word32 cpuid_flags = 0 ;
+
+static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
+    int got_intel_cpu=0;
+    unsigned int reg[5];
+
+    reg[4] = '\0' ;
+    cpuid(reg, 0, 0);
+    if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
+       memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
+       memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
+        got_intel_cpu = 1;
+    }
+    if (got_intel_cpu) {
+        cpuid(reg, leaf, sub);
+        return((reg[num]>>bit)&0x1) ;
+    }
+    return 0 ;
+}
+
+static int set_cpuid_flags(void) {
+    if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ;}
+    if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ;}
+
+    if(cpuid_flags == 0)return 1 ;
+    else return 0 ;
+}
+
+int wc_InitRng(RNG* rng)
+{
+    (void) rng ;
+    return set_cpuid_flags() ;
+}
+
+int wc_RNG_GenerateBlock(RNG* rng, byte* output, word32 sz)
+{
+    (void) rng ;
+    return wc_GenerateSeed(NULL, output, sz) ;
+}
+
+
+int wc_RNG_GenerateByte(RNG* rng, byte* b)
+{
+    (void) rng ;
+    return wc_GenerateSeed(NULL, b, 1) ;
+}
 
 #else /* HAVE_HASHDRBG || NO_RC4 */
 
@@ -978,6 +1055,56 @@ int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz)
     return 0;
 }
 
+#elif defined(HAVE_INTEL_RDGEN)
+
+static inline int IntelRNrand32(unsigned int *rnd)
+{
+    int rdrand;
+
+    __asm__ volatile("rdrand %0":"=r"(rdrand));
+    if(rdrand){
+        *rnd = rdrand ;
+        return 0 ;
+    } else
+        return 1;
+}
+
+
+static inline int IntelRNseed32(unsigned int *seed)
+{
+    int rdseed;
+
+    __asm__ volatile("rdseed %0":"=r"(rdseed));
+    if(rdseed){
+        *seed = rdseed ;
+        return 0 ;
+    } else
+        return 1;
+}
+
+int wc_GenerateSeed(OS_Seed* os, byte* output, word32 sz)
+{
+    (void) os ;
+    int ret ; byte buff[4] ;
+
+    for( ; sz/4 > 0; sz-=4, output+=4) {
+        if (IS_INTEL_RDSEED)ret = IntelRNseed32((word32 *)output) ;
+        else if(IS_INTEL_RDRAND)ret = IntelRNrand32((word32 *)output);
+        else return 1 ;
+        if(ret)
+            return 1 ;
+    }
+    if(sz == 0)return 0 ;
+
+    if (IS_INTEL_RDSEED)ret = IntelRNseed32((word32 *)buff) ;
+    else if(IS_INTEL_RDRAND)ret = IntelRNrand32((word32 *)buff);
+    else return 1 ;
+    if(ret)
+        return 1 ;
+    XMEMCPY(output, buff, sz) ;
+    return 0;
+}
+
 #elif defined(CUSTOM_RAND_GENERATE)
 
    /* Implement your own random generation function
diff --git a/wolfcrypt/src/sha256.c b/wolfcrypt/src/sha256.c
old mode 100644
new mode 100755
index 13a196e77..f25679e76
--- a/wolfcrypt/src/sha256.c
+++ b/wolfcrypt/src/sha256.c
@@ -19,7 +19,6 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  */
-
 
 /* code submitted by raphael.huck@efixo.com */
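/*
 * Note on the RDRAND/RDSEED helpers above: IntelRNrand32()/IntelRNseed32()
 * treat a zero register value as failure and never test the carry flag,
 * which is how RDRAND and RDSEED actually report success (zero is a valid
 * random result).  A minimal, self-contained sketch of the conventional
 * carry-flag-plus-retry pattern is shown below for comparison; the function
 * name TryIntelRdRand32 and the retry bound are illustrative only and are
 * not part of this patch.
 */
static int TryIntelRdRand32(unsigned int* out)
{
    unsigned char ok;
    int i;

    for (i = 0; i < 10; i++) {            /* bounded retry; 10 is arbitrary */
        /* CF = 1 after RDRAND means the destination holds valid random data */
        __asm__ volatile("rdrand %0 ; setc %1"
                         : "=r"(*out), "=qm"(ok)
                         :
                         : "cc");
        if (ok)
            return 0;                     /* success */
    }
    return 1;                             /* DRNG not ready after retries */
}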
#ifdef HAVE_CONFIG_H @@ -66,6 +65,185 @@ int wc_Sha256Hash(const byte* data, word32 len, byte* out) #define FIPS_NO_WRAPPERS #endif +#if defined(USE_INTEL_SPEEDUP) +#define HAVE_INTEL_AVX1 +#define HAVE_INTEL_AVX2 +#endif + +#if defined(HAVE_INTEL_AVX2) +#define HAVE_INTEL_RORX +#endif + + +/***** +Intel AVX1/AVX2 Macro Control Structure + +#define HAVE_INTEL_AVX1 +#define HAVE_INTEL_AVX2 + +#define HAVE_INTEL_RORX + + +int InitSha256(Sha256* sha256) { + Save/Recover XMM, YMM + ... +} + +#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) + Transform() ; Function prototype +#else + Transform() { } + int Sha256Final() { + Save/Recover XMM, YMM + ... + } +#endif + +#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) + #if defined(HAVE_INTEL_RORX + #define RND with rorx instuction + #else + #define RND + #endif +#endif + +#if defined(HAVE_INTEL_AVX1) + + #define XMM Instructions/inline asm + + int Transform() { + Stitched Message Sched/Round + } + +#elif defined(HAVE_INTEL_AVX2) + + #define YMM Instructions/inline asm + + int Transform() { + More granural Stitched Message Sched/Round + } + +*/ + + +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + +/* Each platform needs to query info type 1 from cpuid to see if aesni is + * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts + */ + +#ifndef _MSC_VER + #define cpuid(reg, leaf, sub)\ + __asm__ __volatile__ ("cpuid":\ + "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ + "a" (leaf), "c"(sub)); + + #define XASM_LINK(f) asm(f) +#else + + #include + #define cpuid(a,b) __cpuid((int*)a,b) + + #define XASM_LINK(f) + +#endif /* _MSC_VER */ + +#define EAX 0 +#define EBX 1 +#define ECX 2 +#define EDX 3 + +#define CPUID_AVX1 0x1 +#define CPUID_AVX2 0x2 +#define CPUID_RDRAND 0x4 +#define CPUID_RDSEED 0x8 + +#define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1) +#define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2) +#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND) +#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED) + +static word32 cpuid_flags = 0 ; + +static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { + int got_intel_cpu=0; + unsigned int reg[5]; + + reg[4] = '\0' ; + cpuid(reg, 0, 0); + if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 && + memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 && + memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) { + got_intel_cpu = 1; + } + if (got_intel_cpu) { + cpuid(reg, leaf, sub); + return((reg[num]>>bit)&0x1) ; + } + return 0 ; +} + +static void set_cpuid_flags(void) { + if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ; } + if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; } + if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ;} + if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ;} +} + +/* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */ +static int Transform(Sha256* sha256); + +#if defined(HAVE_INTEL_AVX1) +static int Transform_AVX1(Sha256 *sha256) ; +#endif +#if defined(HAVE_INTEL_AVX2) +static int Transform_AVX2(Sha256 *sha256) ; +static int Transform_AVX1_RORX(Sha256 *sha256) ; +#endif + +static int (*Transform_p)(Sha256* sha256) /* = _Transform */; + +#define XTRANSFORM(sha256, B) (*Transform_p)(sha256) + +static void set_Transform(void) { + set_cpuid_flags() ; + +#if defined(HAVE_INTEL_AVX2) + if(IS_INTEL_AVX2){ Transform_p = Transform_AVX1_RORX; return ; } + Transform_p = Transform_AVX2 ; /* for avoiding warning,"not used" */ +#endif +#if defined(HAVE_INTEL_AVX1) + Transform_p = ((IS_INTEL_AVX1) ? 
Transform_AVX1 : Transform) ; return ; +#endif + Transform_p = Transform ; +} + +#else + #if defined(FREESCALE_MMCAU) + #define XTRANSFORM(sha256, B) Transform(sha256, B) + #else + #define XTRANSFORM(sha256, B) Transform(sha256) + #endif +#endif + +/* Dummy for saving MM_REGs on behalf of Transform */ +#if defined(HAVE_INTEL_AVX2)&& !defined(HAVE_INTEL_AVX1) +#define SAVE_XMM_YMM __asm__ volatile("vpxor %%ymm7, %%ymm7, %%ymm7":::\ + "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15") +#elif defined(HAVE_INTEL_AVX1) +#define SAVE_XMM_YMM __asm__ volatile("vpxor %%xmm7, %%xmm7, %%xmm7":::\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\ + "xmm11","xmm12","xmm13","xmm14","xmm15") +#else +#define SAVE_XMM_YMM +#endif + +#ifdef WOLFSSL_PIC32MZ_HASH +#define InitSha256 InitSha256_sw +#define Sha256Update Sha256Update_sw +#define Sha256Final Sha256Final_sw +#endif + #include #include @@ -107,24 +285,17 @@ int wc_InitSha256(Sha256* sha256) sha256->buffLen = 0; sha256->loLen = 0; sha256->hiLen = 0; + +#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) + set_Transform() ; /* choose best Transform function under this runtime environment */ +#endif return 0; } -#ifdef FREESCALE_MMCAU - #define XTRANSFORM(S,B) Transform((S), (B)) -static int Transform(Sha256* sha256, byte* buf) -{ - cau_sha256_hash_n(buf, 1, sha256->digest); - - return 0; -} - -#else - #define XTRANSFORM(S,B) Transform((S)) - -static const word32 K[64] = { +#if !defined(FREESCALE_MMCAU) +static const __attribute__((aligned(32))) word32 K[64] = { 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, @@ -140,14 +311,54 @@ static const word32 K[64] = { 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L }; +#endif + +#if defined(HAVE_INTEL_RORX) +#define ROTR(func, bits, x) \ +word32 func(word32 x) { word32 ret ;\ + __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x):) ;\ + return ret ;\ +} + +static INLINE ROTR(rotrFixed_2, 2, x) +static INLINE ROTR(rotrFixed_13, 13, x) +static INLINE ROTR(rotrFixed_22, 22, x) +static INLINE ROTR(rotrFixed_6, 6, x) +static INLINE ROTR(rotrFixed_11, 11, x) +static INLINE ROTR(rotrFixed_25, 25, x) +static INLINE ROTR(rotrFixed_7, 7, x) +static INLINE ROTR(rotrFixed_18, 18, x) +static INLINE ROTR(rotrFixed_17, 17, x) +static INLINE ROTR(rotrFixed_19, 19, x) +#endif + +#if defined(FREESCALE_MMCAU) + +static int Transform(Sha256* sha256, byte* buf) +{ + cau_sha256_hash_n(buf, 1, sha256->digest); + + return 0; +} + +#endif /* FREESCALE_MMCAU */ + #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y))) -#define S(x, n) rotrFixed(x, n) #define R(x, n) (((x)&0xFFFFFFFFU)>>(n)) + +#if !defined(HAVE_INTEL_RORX) +#define S(x, n) rotrFixed(x, n) #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) +#else +#define Sigma0(x) (rotrFixed_2(x) ^ rotrFixed_13(x) ^ rotrFixed_22(x)) +#define Sigma1(x) (rotrFixed_6(x) ^ rotrFixed_11(x) ^ rotrFixed_25(x)) +#define Gamma0(x) (rotrFixed_7(x) ^ rotrFixed_18(x) ^ R(x, 3)) +#define Gamma1(x) (rotrFixed_17(x) ^ rotrFixed_19(x) ^ R(x, 10)) +#endif #define RND(a,b,c,d,e,f,g,h,i) \ t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \ @@ -155,7 +366,7 @@ static const word32 K[64] = { 
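/*
 * For reference: the rotrFixed_N()/RORX-based Sigma and Gamma macros above
 * compute the standard SHA-256 functions from FIPS 180-4.  A portable C
 * sketch of the same functions follows for comparison only; the *_ref names
 * are illustrative and are not used by this patch.
 */
static unsigned int rotr32_ref(unsigned int x, unsigned int n)
{
    return (x >> n) | (x << (32 - n));    /* rotate right by n, 0 < n < 32 */
}

static unsigned int Sigma0_ref(unsigned int x)
{
    return rotr32_ref(x, 2) ^ rotr32_ref(x, 13) ^ rotr32_ref(x, 22);
}

static unsigned int Sigma1_ref(unsigned int x)
{
    return rotr32_ref(x, 6) ^ rotr32_ref(x, 11) ^ rotr32_ref(x, 25);
}

static unsigned int Gamma0_ref(unsigned int x)    /* sigma0, message schedule */
{
    return rotr32_ref(x, 7) ^ rotr32_ref(x, 18) ^ (x >> 3);
}

static unsigned int Gamma1_ref(unsigned int x)    /* sigma1, message schedule */
{
    return rotr32_ref(x, 17) ^ rotr32_ref(x, 19) ^ (x >> 10);
}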
(d) += t0; \ (h) = t0 + t1; - +#if !defined(FREESCALE_MMCAU) static int Transform(Sha256* sha256) { word32 S[8], t0, t1; @@ -204,8 +415,7 @@ static int Transform(Sha256* sha256) return 0; } -#endif /* FREESCALE_MMCAU */ - +#endif /* #if !defined(FREESCALE_MMCAU) */ static INLINE void AddLength(Sha256* sha256, word32 len) { @@ -214,12 +424,14 @@ static INLINE void AddLength(Sha256* sha256, word32 len) sha256->hiLen++; /* carry low to high */ } - int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) { + /* do block size increments */ byte* local = (byte*)sha256->buffer; + SAVE_XMM_YMM ; /* for Intel AVX */ + while (len) { word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen); XMEMCPY(&local[sha256->buffLen], data, add); @@ -231,11 +443,13 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) if (sha256->buffLen == SHA256_BLOCK_SIZE) { int ret; - #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + #if defined(LITTLE_ENDIAN_ORDER) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE); #endif - ret = XTRANSFORM(sha256, local); if (ret != 0) return ret; @@ -248,11 +462,12 @@ int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) return 0; } - int wc_Sha256Final(Sha256* sha256, byte* hash) { byte* local = (byte*)sha256->buffer; int ret; + + SAVE_XMM_YMM ; /* for Intel AVX */ AddLength(sha256, sha256->buffLen); /* before adding pads */ @@ -263,7 +478,10 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen); sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen; - #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) + #if defined(LITTLE_ENDIAN_ORDER) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE); #endif @@ -281,16 +499,22 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) sha256->loLen = sha256->loLen << 3; /* store lengths */ - #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) - ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE); + #if defined(LITTLE_ENDIAN_ORDER) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif + ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE); #endif /* ! length ordering dependent on digest endian type ! 
*/ XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32)); XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen, sizeof(word32)); - - #ifdef FREESCALE_MMCAU + + #if defined(FREESCALE_MMCAU) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) /* Kinetis requires only these bytes reversed */ + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(IS_INTEL_AVX1 || IS_INTEL_AVX2) + #endif ByteReverseWords(&sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)], &sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)], 2 * sizeof(word32)); @@ -300,7 +524,7 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) if (ret != 0) return ret; - #ifdef LITTLE_ENDIAN_ORDER + #if defined(LITTLE_ENDIAN_ORDER) ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE); #endif XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE); @@ -309,6 +533,7 @@ int wc_Sha256Final(Sha256* sha256, byte* hash) } + int wc_Sha256Hash(const byte* data, word32 len, byte* hash) { int ret = 0; @@ -341,7 +566,1253 @@ int wc_Sha256Hash(const byte* data, word32 len, byte* hash) return ret; } +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + +#define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + { word32 d ;\ + d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs) ;\ + d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs) ;\ + d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs) ;\ + d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs) ;\ + d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs) ;\ + d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs) ;\ + d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs) ;\ + d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs) ;\ +} + +#define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + { word32 d ; \ + __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ; sha256->digest[0] += d;\ + __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ; sha256->digest[1] += d;\ + __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ; sha256->digest[2] += d;\ + __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ; sha256->digest[3] += d;\ + __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ; sha256->digest[4] += d;\ + __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ; sha256->digest[5] += d;\ + __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ; sha256->digest[6] += d;\ + __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ; sha256->digest[7] += d;\ +} + + +#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) + +#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) + + + + +#define S_0 %r15d +#define S_1 %r10d +#define S_2 %r11d +#define S_3 %r12d +#define S_4 %r13d +#define S_5 %r14d +#define S_6 %ebx +#define S_7 %r9d + +#define SSE_REGs "%esi", "%r8", "%edx", "%ebx","%r9","%r10","%r11","%r12","%r13","%r14","%r15" + +#if defined(HAVE_INTEL_RORX) +#define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("rorx $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\ + +#define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("rorx $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\ +__asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\ +__asm__ volatile("rorx $25, %"#e", 
%%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\ + +#define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\ +__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\ +__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\ +/*__asm__ volatile("movl %%edx, %0\n\t":"=m"(s1)::SSE_REGs); DEBUG */ \ +__asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\ +__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\ + +#define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\ +/*__asm__ volatile("movl %%esi, %0\n\t":"=m"(Ch)); DEBUG*/ \ +/*__asm__ volatile("movl %0, %%edx\n\t"::"m"(w_k):"%edx");*/\ +__asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\ +__asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\ +__asm__ volatile("rorx $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\ +__asm__ volatile("rorx $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13 */\ + +#define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("rorx $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\ +__asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13) */\ +__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma0(a) */\ +/*__asm__ volatile("movl %%edx, %0\n\t":"=m"(s0)); DEBUG */\ + +#define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\ +__asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\ +__asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c*/\ +__asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\ +/*__asm__ volatile("movl %%esi, %0\n\t":"=m"(esi)); DEBUG */\ + +#define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\ +__asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\ +__asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\ +/*__asm__ volatile("movl %%r8d, %0\n\t":"=m"(Maj)); DEBUG */\ +/*__asm__ volatile("movl %0, %%esi\n\t"::"m"(esi)); DEBUG */\ +/*__asm__ volatile("movl %"#h", %0\n\t":"=m"(t0)); DEBUG */\ + +#define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\ +/*__asm__ volatile("movl %"#d", %0\n\t":"=m"(h1)); DEBUG(h1)*/ \ +__asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ + /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\ +__asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); \ + /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c)*/\ +__asm__ volatile("movl %r8d, "#h"\n\t"); + /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ + +#endif + +#define RND_STEP_1(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs);\ +__asm__ volatile("roll $26, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\ +__asm__ volatile("movl %"#e", %%edi\n\t":::"%edi",SSE_REGs);\ + +#define RND_STEP_2(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("roll $21, %%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\ +__asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\ +__asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e */\ +__asm__ 
volatile("roll $7, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\ + +#define RND_STEP_3(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\ +__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\ +__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\ +/*__asm__ volatile("movl %%edx, %0\n\t":"=m"(s1)::SSE_REGs); DEBUG */ \ +__asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\ +__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\ + +#define RND_STEP_4(a,b,c,d,e,f,g,h,i)\ +/*__asm__ volatile("movl %%esi, %0\n\t":"=m"(Ch)); DEBUG*/ \ +/*__asm__ volatile("movl %0, %%edx\n\t"::"m"(w_k):"%edx");*/\ +__asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\ +__asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\ +__asm__ volatile("movl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a */\ +__asm__ volatile("roll $30, %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\ +__asm__ volatile("movl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a */\ +__asm__ volatile("roll $19, %%edi\n\t":::"%edi",SSE_REGs); /* edi = a>>13 */\ +__asm__ volatile("movl %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a */\ + +#define RND_STEP_5(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("roll $10, %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\ +__asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13) */\ +__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a) */\ + +#define RND_STEP_6(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\ +__asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\ +__asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c */\ +__asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\ + +#define RND_STEP_7(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\ +__asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\ +__asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\ + +#define RND_STEP_8(a,b,c,d,e,f,g,h,i)\ +__asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\ +__asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ + /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\ +__asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\ + /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\ +__asm__ volatile("movl %r8d, "#h"\n\t"); \ + /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ + +#define RND_X(a,b,c,d,e,f,g,h,i) \ + RND_STEP_1(a,b,c,d,e,f,g,h,i); \ + RND_STEP_2(a,b,c,d,e,f,g,h,i); \ + RND_STEP_3(a,b,c,d,e,f,g,h,i); \ + RND_STEP_4(a,b,c,d,e,f,g,h,i); \ + RND_STEP_5(a,b,c,d,e,f,g,h,i); \ + RND_STEP_6(a,b,c,d,e,f,g,h,i); \ + RND_STEP_7(a,b,c,d,e,f,g,h,i); \ + RND_STEP_8(a,b,c,d,e,f,g,h,i); + +#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); +#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); +#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); +#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); +#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) 
RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); +#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); +#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); +#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); + + +#define RND_1_3(a,b,c,d,e,f,g,h,i) {\ + RND_STEP_1(a,b,c,d,e,f,g,h,i); \ + RND_STEP_2(a,b,c,d,e,f,g,h,i); \ + RND_STEP_3(a,b,c,d,e,f,g,h,i); \ +} + +#define RND_4_6(a,b,c,d,e,f,g,h,i) {\ + RND_STEP_4(a,b,c,d,e,f,g,h,i); \ + RND_STEP_5(a,b,c,d,e,f,g,h,i); \ + RND_STEP_6(a,b,c,d,e,f,g,h,i); \ +} + +#define RND_7_8(a,b,c,d,e,f,g,h,i) {\ + RND_STEP_7(a,b,c,d,e,f,g,h,i); \ + RND_STEP_8(a,b,c,d,e,f,g,h,i); \ +} + +#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); +#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); +#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); +#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); +#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); +#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); +#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); +#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); + + +#define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); +#define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); +#define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); +#define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); +#define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); +#define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); +#define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); +#define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); + +#define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); +#define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); +#define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); +#define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); +#define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); +#define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); +#define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); +#define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); + +#define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); +#define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); +#define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); +#define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); +#define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) 
RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); +#define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); +#define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); +#define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); + +#define FOR(cnt, init, max, inc, loop) \ + __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):) +#define END(cnt, init, max, inc, loop) \ + __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::) ; + +#endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */ + +#if defined(HAVE_INTEL_AVX1) /* inline Assember for Intel AVX1 instructions */ + +#define voitle +#define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPADDD(op1,op2,op3) __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPSRLD(op1,op2,op3) __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPSRLQ(op1,op2,op3) __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPSLLD(op1,op2,op3) __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPOR(op1,op2,op3) __asm__ volatile("vpor %"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPXOR(op1,op2,op3) __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPSHUFD(op1,op2,op3) __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs) +#define VPSHUFB(op1,op2,op3) __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs) +#undef voitle + +#define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\ + a,b,c,d,e,f,g,h,_i)\ + RND_STEP_1(a,b,c,d,e,f,g,h,_i);\ + VPALIGNR (XTMP0, X3, X2, 4) ;\ + RND_STEP_2(a,b,c,d,e,f,g,h,_i);\ + VPADDD (XTMP0, XTMP0, X0) ;\ + RND_STEP_3(a,b,c,d,e,f,g,h,_i);\ + VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\ + RND_STEP_4(a,b,c,d,e,f,g,h,_i);\ + VPSRLD (XTMP2, XTMP1, 7) ;\ + RND_STEP_5(a,b,c,d,e,f,g,h,_i);\ + VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ + RND_STEP_6(a,b,c,d,e,f,g,h,_i);\ + VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\ + RND_STEP_7(a,b,c,d,e,f,g,h,_i);\ + VPSRLD (XTMP2, XTMP1,18) ;\ + RND_STEP_8(a,b,c,d,e,f,g,h,_i);\ +\ + RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\ + VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\ + RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\ + VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ + RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP3, XTMP3, XTMP1) ;\ + RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ + RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\ + RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\ + VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\ + RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\ + VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\ + RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\ +\ + RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\ + RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ + RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ + RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\ + VPXOR (XTMP2, XTMP2, XTMP3) ;\ + RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\ + VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\ + RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\ + 
VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\ + RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\ + VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\ + RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\ +\ + RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\ + VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\ + RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\ + RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ + RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ + RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\ + VPXOR (XTMP2, XTMP2, XTMP3) ;\ + RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\ + VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\ + RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\ + VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\ + RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\ + VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\ + +#if defined(HAVE_INTEL_RORX) + +#define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \ + XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\ + RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\ + VPALIGNR (XTMP0, X3, X2, 4) ;\ + RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\ + VPADDD (XTMP0, XTMP0, X0) ;\ + RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\ + VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\ + RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\ + VPSRLD (XTMP2, XTMP1, 7) ;\ + RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\ + VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ + RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\ + VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\ + RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\ + VPSRLD (XTMP2, XTMP1,18) ;\ + RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\ +\ + RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\ + VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\ + RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\ + VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ + RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP3, XTMP3, XTMP1) ;\ + RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ + RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\ + VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\ + RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\ + VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\ + RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\ + VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\ + RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\ +\ + RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\ + RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ + RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\ + VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ + RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\ + VPXOR (XTMP2, XTMP2, XTMP3) ;\ + RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\ + VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\ + RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\ + VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\ + RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\ + VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\ + RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\ +\ + RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\ + VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\ + RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\ + 
RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ + RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\ + VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ + RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\ + VPXOR (XTMP2, XTMP2, XTMP3) ;\ + RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\ + VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\ + RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\ + VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\ + RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\ + VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\ + +#endif + + +#define W_K_from_buff\ + {/* X0..3(xmm4..7) = sha256->buffer[0.15]; */\ + static word64 buff[16] ;\ + buff[0] = *(word64*)&sha256->buffer[0] ;\ + buff[1] = *(word64*)&sha256->buffer[2] ;\ + __asm__ volatile("vmovaps %1, %%xmm4\n\t"\ + "vpshufb %0, %%xmm4, %%xmm4\n\t"\ + : "=m"(mBYTE_FLIP_MASK[0]),"=m"(buff) :) ;\ + buff[2] = *(word64*)&sha256->buffer[4] ;\ + buff[3] = *(word64*)&sha256->buffer[6] ;\ + __asm__ volatile("\n\tvmovaps %1, %%xmm5\n\t"\ + "vpshufb %0, %%xmm5, %%xmm5\n\t"\ + : "=m"(mBYTE_FLIP_MASK[0]),"=m"(buff[2]) :) ;\ + buff[4] = *(word64*)&sha256->buffer[8] ;\ + buff[5] = *(word64*)&sha256->buffer[10] ;\ + __asm__ volatile("vmovaps %1, %%xmm6\n\t"\ + "vpshufb %0, %%xmm6, %%xmm6\n\t"\ + : "=m"(mBYTE_FLIP_MASK[0]),"=m"(buff[4]) :) ;\ + buff[6] = *(word64*)&sha256->buffer[12] ;\ + buff[7] = *(word64*)&sha256->buffer[14] ;\ + __asm__ volatile("vmovaps %1, %%xmm7\n\t"\ + "vpshufb %0, %%xmm7, %%xmm7\n\t"\ + : "=m"(mBYTE_FLIP_MASK[0]),"=m"(buff[6]) :) ;\ + } + +#define _SET_W_K_XFER(reg, i)\ + __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs) ;\ + __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs) ; + +#define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i) + +static word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF } ; /* shuffle xBxA -> 00BA */ +static word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 } ; /* shuffle xDxC -> DC00 */ +static word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b } ; + + +#define _Init_Masks(mask1, mask2, mask3)\ +__asm__ volatile("vmovaps %0, %"#mask1 :"=m"(mBYTE_FLIP_MASK)) ;\ +__asm__ volatile("vmovaps %0, %"#mask2 :"=m"(mSHUF_00BA)) ;\ +__asm__ volatile("vmovaps %0, %"#mask3 :"=m"(mSHUF_DC00)) ; + +#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\ + _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) + +#define X0 %xmm4 +#define X1 %xmm5 +#define X2 %xmm6 +#define X3 %xmm7 +#define X_ X0 + +#define XTMP0 %xmm0 +#define XTMP1 %xmm1 +#define XTMP2 %xmm2 +#define XTMP3 %xmm3 +#define XTMP4 %xmm8 +#define XTMP5 %xmm9 +#define XFER %xmm10 + +#define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */ +#define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */ +#define BYTE_FLIP_MASK %xmm13 + +#define XMM_REGs /* Registers are saved in Sha256Update/Finel */ + /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */ + +static int Transform_AVX1(Sha256* sha256) +{ + word32 W_K[64] ; /* temp for W+K */ + + #if defined(DEBUG_XMM) + int i, j ; + word32 xmm[29][4*15] ; + #endif + + Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ; + W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */ + + DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + + SET_W_K_XFER(X0, 0) ; + MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; + SET_W_K_XFER(X1, 4) ; + MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, 
XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ; + SET_W_K_XFER(X2, 8) ; + MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; + SET_W_K_XFER(X3, 12) ; + MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ; + SET_W_K_XFER(X0, 16) ; + MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; + SET_W_K_XFER(X1, 20) ; + MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ; + SET_W_K_XFER(X2, 24) ; + MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; + SET_W_K_XFER(X3, 28) ; + MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ; + SET_W_K_XFER(X0, 32) ; + MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; + SET_W_K_XFER(X1, 36) ; + MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ; + SET_W_K_XFER(X2, 40) ; + MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; + SET_W_K_XFER(X3, 44) ; + MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, + SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ; + + SET_W_K_XFER(X0, 48) ; + SET_W_K_XFER(X1, 52) ; + SET_W_K_XFER(X2, 56) ; + SET_W_K_XFER(X3, 60) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; + + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ; + + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ; + + RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + + #if defined(DEBUG_XMM) + for(i=0; i<29; i++) { + for(j=0; j<4*14; j+=4) + printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i, + xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ; + printf("\n") ; + } + + for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ; + #endif + + return 0; +} + +#if defined(HAVE_INTEL_RORX) +static int Transform_AVX1_RORX(Sha256* sha256) +{ + word32 W_K[64] ; /* temp for W+K */ + + #if defined(DEBUG_XMM) + int i, j ; + word32 xmm[29][4*15] ; + #endif + + Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ; + W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */ + + DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + + SET_W_K_XFER(X0, 0) ; + MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; + SET_W_K_XFER(X1, 4) ; + MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + 
XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ; + SET_W_K_XFER(X2, 8) ; + MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; + SET_W_K_XFER(X3, 12) ; + MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ; + SET_W_K_XFER(X0, 16) ; + MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; + SET_W_K_XFER(X1, 20) ; + MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ; + SET_W_K_XFER(X2, 24) ; + MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; + SET_W_K_XFER(X3, 28) ; + MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ; + SET_W_K_XFER(X0, 32) ; + MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; + SET_W_K_XFER(X1, 36) ; + MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ; + SET_W_K_XFER(X2, 40) ; + MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; + SET_W_K_XFER(X3, 44) ; + MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, + XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ; + + SET_W_K_XFER(X0, 48) ; + SET_W_K_XFER(X1, 52) ; + SET_W_K_XFER(X2, 56) ; + SET_W_K_XFER(X3, 60) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; + + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ; + + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ; + + RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + + #if defined(DEBUG_XMM) + for(i=0; i<29; i++) { + for(j=0; j<4*14; j+=4) + printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i, + xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ; + printf("\n") ; + } + + for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ; + #endif + + return 0; +} +#endif /* HAVE_INTEL_RORX */ + +#endif /* HAVE_INTEL_AVX1 */ + + +#if defined(HAVE_INTEL_AVX2) + +#define _DUMP_REG(REG, name)\ + { unsigned int buf[16] ;unsigned int reg[8][8];int k ;\ + __asm__ volatile("vmovdqu %%ymm4, %0 \n\t":"=m"(reg[0][0])::YMM_REGs);\ + __asm__ volatile("vmovdqu %%ymm5, %0 \n\t":"=m"(reg[1][0])::YMM_REGs);\ + __asm__ volatile("vmovdqu %%ymm6, %0 \n\t":"=m"(reg[2][0])::YMM_REGs);\ + __asm__ volatile("vmovdqu %%ymm7, %0 \n\t":"=m"(reg[3][0])::YMM_REGs);\ + __asm__ volatile("vmovdqu %%ymm8, %0 \n\t":"=m"(reg[4][0])::YMM_REGs);\ + __asm__ volatile("vmovdqu %%ymm9, %0 
\n\t":"=m"(reg[5][0])::YMM_REGs);\ + __asm__ volatile("vmovdqu %%ymm10, %0 \n\t":"=m"(reg[6][0])::YMM_REGs);\ + __asm__ volatile("vmovdqu %%ymm11, %0 \n\t":"=m"(reg[7][0])::YMM_REGs);\ + __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::YMM_REGs);\ + printf(" "#name":\t") ; for(k=0; k<8; k++) printf("%08x.", buf[k]) ; printf("\n") ; \ + __asm__ volatile("vmovdqu %0, %%ymm4 \n\t"::"m"(reg[0][0]):YMM_REGs);\ + __asm__ volatile("vmovdqu %0, %%ymm5 \n\t"::"m"(reg[1][0]):YMM_REGs);\ + __asm__ volatile("vmovdqu %0, %%ymm6 \n\t"::"m"(reg[2][0]):YMM_REGs);\ + __asm__ volatile("vmovdqu %0, %%ymm7 \n\t"::"m"(reg[3][0]):YMM_REGs);\ + __asm__ volatile("vmovdqu %0, %%ymm8 \n\t"::"m"(reg[4][0]):YMM_REGs);\ + __asm__ volatile("vmovdqu %0, %%ymm9 \n\t"::"m"(reg[5][0]):YMM_REGs);\ + __asm__ volatile("vmovdqu %0, %%ymm10 \n\t"::"m"(reg[6][0]):YMM_REGs);\ + __asm__ volatile("vmovdqu %0, %%ymm11 \n\t"::"m"(reg[7][0]):YMM_REGs);\ +} + +#if DEBUG + +#define DUMP_REG(REG) _DUMP_REG(REG, #REG) +#define DUMP_REG2(REG) _DUMP_REG(REG, #REG) +#define PRINTF(fmt, ...) + +#else + +#define DUMP_REG(REG) +#define DUMP_REG2(REG) +#define PRINTF(fmt, ...) + +#endif + +#define DUMP_ALL DUMP_REG(W_I_16) ; DUMP_REG(W_I_15) ; DUMP_REG(W_I_7) ; DUMP_REG(W_I_2) ; DUMP_REG(W_I) ; + +#define _MOVE_to_REG(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs) ; +#define _MOVE_to_MEM(mem, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" :: "m"(mem):YMM_REGs) ; +#define _BYTE_SWAP(ymm, map) __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\ + :: "m"(map):YMM_REGs) ; +#define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\ + #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ; +#define _MOVE_BYTE(ymm0, ymm1, map) __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\ + #ymm0"\n\t":: "m"(map):YMM_REGs) ; +#define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrld $"#bits", %%"\ + #src", %%"#dest"\n\tvpslld $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\ + #temp",%%"#dest", %%"#dest" ":::YMM_REGs) ; +#define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrld $"#bits", %%"\ + #src", %%"#dest" ":::YMM_REGs) ; +#define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\ + #src2", %%"#dest" ":::YMM_REGs) ; +#define _OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\ + #src2", %%"#dest" ":::YMM_REGs) ; +#define _ADD(dest, src1, src2) __asm__ volatile("vpaddd %%"#src1", %%"\ + #src2", %%"#dest" ":::YMM_REGs) ; +#define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddd %0, %%"#src1", %%"\ + #dest" "::"m"(mem):YMM_REGs) ; +#define _BLEND(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\ + #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ; + +#define _EXTRACT_XMM_0(xmm, mem) __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_1(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_2(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_3(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_4(ymm, xmm, mem)\ + __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;\ + __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_5(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_6(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; +#define _EXTRACT_XMM_7(xmm, mem) __asm__ 
volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; + +#define _SWAP_YMM_HL(ymm) __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ; +#define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm) + +#define MOVE_to_REG(ymm, mem) _MOVE_to_REG(ymm, mem) +#define MOVE_to_MEM(mem, ymm) _MOVE_to_MEM(mem, ymm) +#define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map) +#define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map) +#define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map) +#define XOR(dest, src1, src2) _XOR(dest, src1, src2) +#define OR(dest, src1, src2) _OR(dest, src1, src2) +#define ADD(dest, src1, src2) _ADD(dest, src1, src2) +#define ADD_MEM(dest, src1, mem) _ADD_MEM(dest, src1, mem) +#define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2) + +#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp); +#define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP) +#define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits) + +#define GAMMA0(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); \ + XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); XOR(dest, G_TEMP, dest) ; +#define GAMMA0_1(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); +#define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); \ + XOR(dest, G_TEMP, dest) ; + +#define GAMMA1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \ + XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest) ; +#define GAMMA1_1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); +#define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); \ + XOR(dest, G_TEMP, dest) ; + +#define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]) ; \ + BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2) ; +#define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ; DUMP_REG(YMM_TEMP0) ; \ + MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]) ; BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2) ; +#define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]) ; \ + BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2) ; + +#define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ;\ + MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]) ; BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7) ; + +#undef voitle + +#define W_I_16 ymm8 +#define W_I_15 ymm9 +#define W_I_7 ymm10 +#define W_I_2 ymm11 +#define W_I ymm12 +#define G_TEMP ymm13 +#define S_TEMP ymm14 +#define YMM_TEMP0 ymm15 +#define YMM_TEMP0x xmm15 +#define W_I_TEMP ymm7 +#define W_K_TEMP ymm15 +#define W_K_TEMPx xmm15 + +#define YMM_REGs /* Registers are saved in Sha256Update/Finel */ + /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/ + + +#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\ + __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\ + __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs) ;\ + __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\ + __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\ + __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\ + +#define MOVE_7_to_15(w_i_15, w_i_7)\ + __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\ + +#define MOVE_I_to_7(w_i_7, w_i)\ + __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\ + __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\ + __asm__ volatile("vpshufd $0x39, %%"#w_i_7", 
%%"#w_i_7" ":::YMM_REGs) ;\ + +#define MOVE_I_to_2(w_i_2, w_i)\ + __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs) ;\ + __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs) ;\ + +#define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\ + MOVE_15_to_16(w_i_16, w_i_15, w_i_7) ; \ + MOVE_7_to_15(w_i_15, w_i_7) ; \ + MOVE_I_to_7(w_i_7, w_i) ; \ + MOVE_I_to_2(w_i_2, w_i) ;\ + +#define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + { word32 d ;\ + __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ;\ + sha256->digest[0] += d;\ + __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ;\ + sha256->digest[1] += d;\ + __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ;\ + sha256->digest[2] += d;\ + __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ;\ + sha256->digest[3] += d;\ + __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ;\ + sha256->digest[4] += d;\ + __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ;\ + sha256->digest[5] += d;\ + __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ;\ + sha256->digest[6] += d;\ + __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ;\ + sha256->digest[7] += d;\ +} + +#define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + { word32 d[8] ;\ + __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs) ;\ + __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs) ;\ + __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs) ;\ + __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs) ;\ + __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs) ;\ + __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs) ;\ + __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs) ;\ + __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs) ;\ + printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\ + __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs) ;\ + __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs) ;\ + __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs) ;\ + __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs) ;\ + __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs) ;\ + __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs) ;\ + __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs) ;\ + __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs) ;\ +} + + +#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) + +#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) + +#define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ + _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) + + + /* Byte swap Masks to ensure that rest of the words are filled with zero's. 
*/ + static const unsigned long mBYTE_FLIP_MASK_16[] = + { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ; + static const unsigned long mBYTE_FLIP_MASK_15[] = + { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ; + static const unsigned long mBYTE_FLIP_MASK_7 [] = + { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b } ; + static const unsigned long mBYTE_FLIP_MASK_2 [] = + { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 } ; + + static const unsigned long mMAPtoW_I_7[] = + { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 } ; + static const unsigned long mMAP1toW_I_2[] = + { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 } ; + static const unsigned long mMAP2toW_I_2[] = + { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 } ; + static const unsigned long mMAP3toW_I_2[] = + { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 } ; + +static int Transform_AVX2(Sha256* sha256) +{ + + #ifdef WOLFSSL_SMALL_STACK + word32* W_K; + W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); + if (W_K == NULL) + return MEMORY_E; + #else + static word32 W_K[64]; + #endif + + MOVE_to_REG(W_I_16, sha256->buffer[0]); BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]) ; + MOVE_to_REG(W_I_15, sha256->buffer[1]); BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]) ; + MOVE_to_REG(W_I, sha256->buffer[8]) ; BYTE_SWAP(W_I, mBYTE_FLIP_MASK_16[0]) ; + MOVE_to_REG(W_I_7, sha256->buffer[16-7]) ; BYTE_SWAP(W_I_7, mBYTE_FLIP_MASK_7[0]) ; + MOVE_to_REG(W_I_2, sha256->buffer[16-2]) ; BYTE_SWAP(W_I_2, mBYTE_FLIP_MASK_2[0]) ; + + + + DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + + ADD_MEM(W_K_TEMP, W_I_16, K[0]) ; + MOVE_to_MEM(W_K[0], W_K_TEMP) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3) ; + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7) ; + + ADD_MEM(YMM_TEMP0, W_I, K[8]) ; + MOVE_to_MEM(W_K[8], YMM_TEMP0) ; + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ + 
RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ; + GAMMA1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ; + DUMP_ALL ; + MOVE_to_REG(YMM_TEMP0, K[16]) ; + RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[16], YMM_TEMP0) ; + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ; + GAMMA1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ + RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ; + DUMP_ALL ; + MOVE_to_REG(YMM_TEMP0, K[24]) ; + RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[24], YMM_TEMP0) ; + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + 
RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ + RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ; + GAMMA1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ; + DUMP_ALL ; + MOVE_to_REG(YMM_TEMP0, K[32]) ; + RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[32], YMM_TEMP0) ; + + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ + RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ; 
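/*
 * Structure of these stanzas: each RND_n macro runs one SHA-256 round
 * (h += Sigma1(e) + Ch(e,f,g) + K[t] + W[t]; d += h; h += Sigma0(a) + Maj(a,b,c)),
 * with the digit selecting which rotation of S_0..S_7 plays a..h, and the
 * RND_n_0/_1/_2 forms split that round into thirds so it can be interleaved
 * with the YMM schedule work. Each stanza builds eight new schedule words,
 * W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16]; because W[i-2]
 * and W[i-7] partly overlap the eight words being produced, the
 * FEEDBACK*_to_W_I_* steps copy just-completed words back into W_I_2 and
 * W_I_7 before the remaining lanes are finished. The recurring progress
 * comments ("now W[16..17] are completed", ...) describe the first stanza;
 * the indices shift by eight in each stanza that follows. ROTATE_W then
 * renames the registers for the next group, and the W+K sums are spilled
 * to W_K[] for the round macros to consume.
 */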
+ GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ; + DUMP_ALL ; + MOVE_to_REG(YMM_TEMP0, K[40]) ; + RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[40], YMM_TEMP0) ; + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ + RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ; + DUMP_ALL ; + MOVE_to_REG(YMM_TEMP0, K[48]) ; + RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[48], YMM_TEMP0) ; + + /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */ + RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; + GAMMA0_1(W_I_TEMP, W_I_15) ; + RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; + GAMMA0_2(W_I_TEMP, W_I_15) ; + RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; + ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */ + RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; + ADD(W_I, W_I_7, W_I_TEMP); + RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; + ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */ + RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; + FEEDBACK1_to_W_I_2 ; + RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; + FEEDBACK_to_W_I_7 ; + RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) 
; + ADD(W_I_TEMP, W_I_7, W_I_TEMP); + RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */ + RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; + FEEDBACK2_to_W_I_2 ; + RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */ + RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; + FEEDBACK3_to_W_I_2 ; + RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; + GAMMA1_1(YMM_TEMP0, W_I_2) ; + RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; + GAMMA1_2(YMM_TEMP0, W_I_2) ; + RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; + ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */ + RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; + DUMP_ALL ; + MOVE_to_REG(YMM_TEMP0, K[56]) ; + RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; + ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ; + RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; + ADD(YMM_TEMP0, YMM_TEMP0, W_I) ; + MOVE_to_MEM(W_K[56], YMM_TEMP0) ; + + RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ; + RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ; + RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ; + RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ; + + RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ; + RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ; + RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ; + RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ; + + RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; + + #ifdef WOLFSSL_SMALL_STACK + XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + + return 0; +} + +#endif /* HAVE_INTEL_AVX2 */ +#endif /* HAVE_FIPS */ -#endif /* HAVE_FIPS */ #endif /* NO_SHA256 */ diff --git a/wolfcrypt/src/sha512.c b/wolfcrypt/src/sha512.c index f4ba41cfd..48b8a8b0c 100755 --- a/wolfcrypt/src/sha512.c +++ b/wolfcrypt/src/sha512.c @@ -97,6 +97,241 @@ int wc_Sha384Hash(const byte* data, word32 len, byte* out) #endif /* min */ +#if defined(USE_INTEL_SPEEDUP) + #define HAVE_INTEL_AVX1 + #define HAVE_INTEL_AVX2 +#endif + +#if defined(HAVE_INTEL_AVX1) +/* #define DEBUG_XMM */ +#endif + +#if defined(HAVE_INTEL_AVX2) +#define HAVE_INTEL_RORX +/* #define DEBUG_YMM */ +#endif + +/***** +Intel AVX1/AVX2 Macro Control Structure + +#if defined(HAVE_INteL_SPEEDUP) + #define HAVE_INTEL_AVX1 + #define HAVE_INTEL_AVX2 +#endif + +int InitSha512(Sha512* sha512) { + Save/Recover XMM, YMM + ... + + Check Intel AVX cpuid flags +} + +#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) + Transform_AVX1() ; # Function prototype + Transform_AVX2() ; # +#endif + + _Transform() { # Native Transform Function body + + } + + int Sha512Update() { + Save/Recover XMM, YMM + ... + } + + int Sha512Final() { + Save/Recover XMM, YMM + ... + } + + +#if defined(HAVE_INTEL_AVX1) + + XMM Instructions/inline asm Definitions + +#endif + +#if defined(HAVE_INTEL_AVX2) + + YMM Instructions/inline asm Definitions + +#endif + +#if defnied(HAVE_INTEL_AVX1) + + int Transform_AVX1() { + Stitched Message Sched/Round + } + +#endif + +#if defnied(HAVE_INTEL_AVX2) + + int Transform_AVX2() { + Stitched Message Sched/Round + } +#endif + + +*/ + +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + + +/* Each platform needs to query info type 1 from cpuid to see if aesni is + * supported. 
Also, let's setup a macro for proper linkage w/o ABI conflicts + */ + +#ifndef _MSC_VER + #define cpuid(reg, leaf, sub)\ + __asm__ __volatile__ ("cpuid":\ + "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ + "a" (leaf), "c"(sub)); + + #define XASM_LINK(f) asm(f) +#else + + #include + #define cpuid(a,b) __cpuid((int*)a,b) + + #define XASM_LINK(f) + +#endif /* _MSC_VER */ + +#define EAX 0 +#define EBX 1 +#define ECX 2 +#define EDX 3 + +#define CPUID_AVX1 0x1 +#define CPUID_AVX2 0x2 +#define CPUID_RDRAND 0x4 +#define CPUID_RDSEED 0x8 + +#define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1) +#define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2) +#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND) +#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED) + +static word32 cpuid_flags = 0 ; + +static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { + int got_intel_cpu=0; + unsigned int reg[5]; + + reg[4] = '\0' ; + cpuid(reg, 0, 0); + if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 && + memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 && + memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) { + got_intel_cpu = 1; + } + if (got_intel_cpu) { + cpuid(reg, leaf, sub); + return((reg[num]>>bit)&0x1) ; + } + return 0 ; +} + +static void set_cpuid_flags(void) { + if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;} + if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; } + if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; } + if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; } +} + + +/* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */ + +#if defined(HAVE_INTEL_AVX1) +static int Transform_AVX1(Sha512 *sha512) ; +#endif + +#if defined(HAVE_INTEL_AVX2) +static int Transform_AVX2(Sha512 *sha512) ; + +#if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX) +static int Transform_AVX1_RORX(Sha512 *sha512) ; +#endif + +#endif + +static int _Transform(Sha512 *sha512) ; + +static int (*Transform_p)(Sha512* sha512) = _Transform ; + +#define Transform(sha512) (*Transform_p)(sha512) + +static void set_Transform(void) { + set_cpuid_flags() ; + +#if defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) + Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ; +#elif defined(HAVE_INTEL_AVX2) + #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX) + if(IS_INTEL_AVX2) { Transform_p = Transform_AVX1_RORX ; return ; } + #endif + if(IS_INTEL_AVX2) { Transform_p = Transform_AVX2 ; return ; } + #if defined(HAVE_INTEL_AVX1) + Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ; + #endif +#else + Transform_p = ((IS_INTEL_AVX1) ? 
Transform_AVX1 : _Transform) ; +#endif +} + +#else + #define Transform(sha512) _Transform(sha512) +#endif + +/* Dummy for saving MM_REGs on behalf of Transform */ +#if defined(HAVE_INTEL_AVX2) +#define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\ + "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11",\ + "%ymm12","%ymm13","%ymm14","%ymm15") +#elif defined(HAVE_INTEL_AVX1) + #define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\ + "xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13") +#else +#define SAVE_XMM_YMM +#endif + +#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) + +#include + +#endif /* defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) */ + + +#if defined(HAVE_INTEL_RORX) +#define ROTR(func, bits, x) \ +word64 func(word64 x) { word64 ret ;\ + __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x):) ;\ + return ret ;\ +} + +static INLINE ROTR(rotrFixed64_28, 28, x) +static INLINE ROTR(rotrFixed64_34, 34, x) +static INLINE ROTR(rotrFixed64_39, 39, x) +static INLINE ROTR(rotrFixed64_14, 14, x) +static INLINE ROTR(rotrFixed64_18, 18, x) +static INLINE ROTR(rotrFixed64_41, 41, x) + +#define S0_RORX(x) (rotrFixed64_28(x)^rotrFixed64_34(x)^rotrFixed64_39(x)) +#define S1_RORX(x) (rotrFixed64_14(x)^rotrFixed64_18(x)^rotrFixed64_41(x)) +#endif + +#if defined(HAVE_BYTEREVERSE64) && !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) +#define ByteReverseWords64(out, in, size) ByteReverseWords64_1(out, size) +#define ByteReverseWords64_1(buf, size)\ + { unsigned int i ;\ + for(i=0; i< size/sizeof(word64); i++){\ + __asm__ volatile("bswapq %0":"+r"(buf[i])::) ;\ + }\ +} +#endif + int wc_InitSha512(Sha512* sha512) { @@ -112,8 +347,12 @@ int wc_InitSha512(Sha512* sha512) sha512->buffLen = 0; sha512->loLen = 0; sha512->hiLen = 0; - - return 0; + +#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) + set_Transform() ; /* choose best Transform function under this runtime environment */ +#endif + + return 0 ; } @@ -161,7 +400,9 @@ static const word64 K512[80] = { }; + #define blk0(i) (W[i] = sha512->buffer[i]) + #define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15])) #define Ch(x,y,z) (z^(x&(y^z))) @@ -182,24 +423,23 @@ static const word64 K512[80] = { #define s1(x) (rotrFixed64(x,19)^rotrFixed64(x,61)^(x>>6)) #define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk0(i));\ - d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) + d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) #define blk384(i) (W[i] = sha384->buffer[i]) #define R2(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk384(i));\ - d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) + d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) - -static int Transform(Sha512* sha512) +static int _Transform(Sha512* sha512) { const word64* K = K512; word32 j; word64 T[8]; + #ifdef WOLFSSL_SMALL_STACK word64* W; - W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER); if (W == NULL) return MEMORY_E; @@ -259,11 +499,11 @@ static INLINE void AddLength(Sha512* sha512, word32 len) sha512->hiLen++; /* carry low to high */ } - int wc_Sha512Update(Sha512* sha512, const byte* data, word32 len) { /* do block size increments */ byte* local = (byte*)sha512->buffer; + SAVE_XMM_YMM ; /* for Intel AVX */ while (len) { word32 add = min(len, SHA512_BLOCK_SIZE - sha512->buffLen); @@ -275,9 +515,11 @@ int wc_Sha512Update(Sha512* sha512, const byte* data, word32 len) if (sha512->buffLen == SHA512_BLOCK_SIZE) { int ret; - - #ifdef LITTLE_ENDIAN_ORDER - 
ByteReverseWords64(sha512->buffer, sha512->buffer, + #if defined(LITTLE_ENDIAN_ORDER) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif + ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_BLOCK_SIZE); #endif ret = Transform(sha512); @@ -297,6 +539,7 @@ int wc_Sha512Final(Sha512* sha512, byte* hash) byte* local = (byte*)sha512->buffer; int ret; + SAVE_XMM_YMM ; /* for Intel AVX */ AddLength(sha512, sha512->buffLen); /* before adding pads */ local[sha512->buffLen++] = 0x80; /* add 1 */ @@ -305,8 +548,10 @@ int wc_Sha512Final(Sha512* sha512, byte* hash) if (sha512->buffLen > SHA512_PAD_SIZE) { XMEMSET(&local[sha512->buffLen], 0, SHA512_BLOCK_SIZE -sha512->buffLen); sha512->buffLen += SHA512_BLOCK_SIZE - sha512->buffLen; - - #ifdef LITTLE_ENDIAN_ORDER + #if defined(LITTLE_ENDIAN_ORDER) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif ByteReverseWords64(sha512->buffer,sha512->buffer,SHA512_BLOCK_SIZE); #endif ret = Transform(sha512); @@ -323,13 +568,22 @@ int wc_Sha512Final(Sha512* sha512, byte* hash) sha512->loLen = sha512->loLen << 3; /* store lengths */ - #ifdef LITTLE_ENDIAN_ORDER + #if defined(LITTLE_ENDIAN_ORDER) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_PAD_SIZE); #endif /* ! length ordering dependent on digest endian type ! */ + sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen; sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen; - + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(IS_INTEL_AVX1 || IS_INTEL_AVX2) + ByteReverseWords64(&(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]), + &(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]), + SHA512_BLOCK_SIZE - SHA512_PAD_SIZE); + #endif ret = Transform(sha512); if (ret != 0) return ret; @@ -375,9 +629,737 @@ int wc_Sha512Hash(const byte* data, word32 len, byte* hash) return ret; } +#if defined(HAVE_INTEL_AVX1) + +#define Rx_1(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i] ; +#define Rx_2(i) d(i)+=h(i); +#define Rx_3(i) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)); + +#if defined(HAVE_INTEL_RORX) +#define Rx_RORX_1(i) h(i)+=S1_RORX(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i] ; +#define Rx_RORX_2(i) d(i)+=h(i); +#define Rx_RORX_3(i) h(i)+=S0_RORX(a(i))+Maj(a(i),b(i),c(i)); +#endif + +#endif + +#if defined(HAVE_INTEL_AVX2) +#define Ry_1(i, w) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + w ; +#define Ry_2(i, w) d(i)+=h(i); +#define Ry_3(i, w) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)); +#endif + +#if defined(HAVE_INTEL_AVX1) /* inline Assember for Intel AVX1 instructions */ +#if defined(DEBUG_XMM) + +#define SAVE_REG(i) __asm__ volatile("vmovdqu %%xmm"#i", %0 \n\t":"=m"(reg[i][0])::XMM_REGs); +#define RECV_REG(i) __asm__ volatile("vmovdqu %0, %%xmm"#i" \n\t"::"m"(reg[i][0]):XMM_REGs); + +#define _DUMP_REG(REG, name)\ + { word64 buf[16] ;word64 reg[16][2];int k ;\ + SAVE_REG(0); SAVE_REG(1); SAVE_REG(2); SAVE_REG(3); SAVE_REG(4); \ + SAVE_REG(5); SAVE_REG(6); SAVE_REG(7);SAVE_REG(8); SAVE_REG(9); SAVE_REG(10);\ + SAVE_REG(11); SAVE_REG(12); SAVE_REG(13); SAVE_REG(14); SAVE_REG(15); \ + __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::XMM_REGs);\ + printf(" "#name":\t") ; for(k=0; k<2; k++) printf("%016lx.", (word64)(buf[k])); printf("\n") ; \ + RECV_REG(0); RECV_REG(1); RECV_REG(2); RECV_REG(3); RECV_REG(4);\ + RECV_REG(5); RECV_REG(6); 
RECV_REG(7); RECV_REG(8); RECV_REG(9);\ + RECV_REG(10); RECV_REG(11); RECV_REG(12); RECV_REG(13); RECV_REG(14); RECV_REG(15);\ + } + +#define DUMP_REG(REG) _DUMP_REG(REG, #REG) +#define PRINTF(fmt, ...) + +#else + +#define DUMP_REG(REG) +#define PRINTF(fmt, ...) + +#endif + +#define _MOVE_to_REG(xymm, mem) __asm__ volatile("vmovdqu %0, %%"#xymm" "\ + :: "m"(mem):XMM_REGs) ; +#define _MOVE_to_MEM(mem,i, xymm) __asm__ volatile("vmovdqu %%"#xymm", %0" :\ + "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3])::XMM_REGs) ; +#define _MOVE(dest, src) __asm__ volatile("vmovdqu %%"#src", %%"\ + #dest" ":::XMM_REGs) ; + +#define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrlq $"#bits", %%"\ + #src", %%"#dest"\n\tvpsllq $64-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\ + #temp",%%"#dest", %%"#dest" ":::XMM_REGs) ; +#define _AVX1_R(dest, src, bits) __asm__ volatile("vpsrlq $"#bits", %%"\ + #src", %%"#dest" ":::XMM_REGs) ; +#define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\ + #src2", %%"#dest" ":::XMM_REGs) ; +#define _OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\ + #src2", %%"#dest" ":::XMM_REGs) ; +#define _ADD(dest, src1, src2) __asm__ volatile("vpaddq %%"#src1", %%"\ + #src2", %%"#dest" ":::XMM_REGs) ; +#define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddq %0, %%"#src1", %%"\ + #dest" "::"m"(mem):XMM_REGs) ; + +#define MOVE_to_REG(xymm, mem) _MOVE_to_REG(xymm, mem) +#define MOVE_to_MEM(mem, i, xymm) _MOVE_to_MEM(mem, i, xymm) +#define MOVE(dest, src) _MOVE(dest, src) + +#define XOR(dest, src1, src2) _XOR(dest, src1, src2) +#define OR(dest, src1, src2) _OR(dest, src1, src2) +#define ADD(dest, src1, src2) _ADD(dest, src1, src2) + +#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp); +#define AVX1_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP) +#define AVX1_R(dest, src, bits) _AVX1_R(dest, src, bits) + +#define Init_Mask(mask) \ + __asm__ volatile("vmovdqu %0, %%xmm1\n\t"::"m"(mask):"%xmm1") ; + +#define _W_from_buff1(w, buff, xmm) \ + /* X0..3(xmm4..7), W[0..15] = sha512->buffer[0.15]; */\ + __asm__ volatile("vmovdqu %1, %%"#xmm"\n\t"\ + "vpshufb %%xmm1, %%"#xmm", %%"#xmm"\n\t"\ + "vmovdqu %%"#xmm", %0"\ + :"=m"(w): "m"(buff):"%xmm0") ; + +#define W_from_buff1(w, buff, xmm) _W_from_buff1(w, buff, xmm) + +#define W_from_buff(w, buff)\ + Init_Mask(mBYTE_FLIP_MASK[0]) ;\ + W_from_buff1(w[0], buff[0], W_0);\ + W_from_buff1(w[2], buff[2], W_2);\ + W_from_buff1(w[4], buff[4], W_4);\ + W_from_buff1(w[6], buff[6], W_6);\ + W_from_buff1(w[8], buff[8], W_8);\ + W_from_buff1(w[10],buff[10],W_10);\ + W_from_buff1(w[12],buff[12],W_12);\ + W_from_buff1(w[14],buff[14],W_14); + +static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f } ; + +#define W_I_15 xmm14 +#define W_I_7 xmm11 +#define W_I_2 xmm13 +#define W_I xmm12 +#define G_TEMP xmm0 +#define S_TEMP xmm1 +#define XMM_TEMP0 xmm2 + +#define W_0 xmm12 +#define W_2 xmm3 +#define W_4 xmm4 +#define W_6 xmm5 +#define W_8 xmm6 +#define W_10 xmm7 +#define W_12 xmm8 +#define W_14 xmm9 + +#define XMM_REGs + +#define s0_1(dest, src) AVX1_S(dest, src, 1); +#define s0_2(dest, src) AVX1_S(G_TEMP, src, 8); XOR(dest, G_TEMP, dest) ; +#define s0_3(dest, src) AVX1_R(G_TEMP, src, 7); XOR(dest, G_TEMP, dest) ; + +#define s1_1(dest, src) AVX1_S(dest, src, 19); +#define s1_2(dest, src) AVX1_S(G_TEMP, src, 61); XOR(dest, G_TEMP, dest) ; +#define s1_3(dest, src) AVX1_R(G_TEMP, src, 6); XOR(dest, G_TEMP, dest) ; + +#define s0_(dest, src) s0_1(dest, src) ; s0_2(dest, src) ; s0_3(dest, src) +#define 
s1_(dest, src) s1_1(dest, src) ; s1_2(dest, src) ; s1_3(dest, src) + +static word64 W_X[16+4]; +#define Block_xx_1(i) \ + MOVE_to_REG(W_I_15, W_X[(i-15)&15]) ;\ + MOVE_to_REG(W_I_7, W_X[(i- 7)&15]) ;\ + +#define Block_xx_2(i) \ + MOVE_to_REG(W_I_2, W_X[(i- 2)&15]) ;\ + MOVE_to_REG(W_I, W_X[(i)]) ;\ + +#define Block_xx_3(i) \ + s0_ (XMM_TEMP0, W_I_15) ;\ + +#define Block_xx_4(i) \ + ADD(W_I, W_I, XMM_TEMP0) ;\ + ADD(W_I, W_I, W_I_7) ;\ + +#define Block_xx_5(i) \ + s1_ (XMM_TEMP0, W_I_2) ;\ + +#define Block_xx_6(i) \ + ADD(W_I, W_I, XMM_TEMP0) ;\ + MOVE_to_MEM(W_X,i, W_I) ;\ + if(i==0)\ + MOVE_to_MEM(W_X,16, W_I) ;\ + +#define Block_xx_7(i) \ + MOVE_to_REG(W_I_15, W_X[(i-15)&15]) ;\ + MOVE_to_REG(W_I_7, W_X[(i- 7)&15]) ;\ + +#define Block_xx_8(i) \ + MOVE_to_REG(W_I_2, W_X[(i- 2)&15]) ;\ + MOVE_to_REG(W_I, W_X[(i)]) ;\ + +#define Block_xx_9(i) \ + s0_ (XMM_TEMP0, W_I_15) ;\ + +#define Block_xx_10(i) \ + ADD(W_I, W_I, XMM_TEMP0) ;\ + ADD(W_I, W_I, W_I_7) ;\ + +#define Block_xx_11(i) \ + s1_ (XMM_TEMP0, W_I_2) ;\ + +#define Block_xx_12(i) \ + ADD(W_I, W_I, XMM_TEMP0) ;\ + MOVE_to_MEM(W_X,i, W_I) ;\ + if((i)==0)\ + MOVE_to_MEM(W_X,16, W_I) ;\ + +static inline void Block_0_1(void) { Block_xx_1(0) ; } +static inline void Block_0_2(void) { Block_xx_2(0) ; } +static inline void Block_0_3(void) { Block_xx_3(0) ; } +static inline void Block_0_4(void) { Block_xx_4(0) ; } +static inline void Block_0_5(void) { Block_xx_5(0) ; } +static inline void Block_0_6(void) { Block_xx_6(0) ; } +static inline void Block_0_7(void) { Block_xx_7(2) ; } +static inline void Block_0_8(void) { Block_xx_8(2) ; } +static inline void Block_0_9(void) { Block_xx_9(2) ; } +static inline void Block_0_10(void){ Block_xx_10(2) ; } +static inline void Block_0_11(void){ Block_xx_11(2) ; } +static inline void Block_0_12(void){ Block_xx_12(2) ; } + +static inline void Block_4_1(void) { Block_xx_1(4) ; } +static inline void Block_4_2(void) { Block_xx_2(4) ; } +static inline void Block_4_3(void) { Block_xx_3(4) ; } +static inline void Block_4_4(void) { Block_xx_4(4) ; } +static inline void Block_4_5(void) { Block_xx_5(4) ; } +static inline void Block_4_6(void) { Block_xx_6(4) ; } +static inline void Block_4_7(void) { Block_xx_7(6) ; } +static inline void Block_4_8(void) { Block_xx_8(6) ; } +static inline void Block_4_9(void) { Block_xx_9(6) ; } +static inline void Block_4_10(void){ Block_xx_10(6) ; } +static inline void Block_4_11(void){ Block_xx_11(6) ; } +static inline void Block_4_12(void){ Block_xx_12(6) ; } + +static inline void Block_8_1(void) { Block_xx_1(8) ; } +static inline void Block_8_2(void) { Block_xx_2(8) ; } +static inline void Block_8_3(void) { Block_xx_3(8) ; } +static inline void Block_8_4(void) { Block_xx_4(8) ; } +static inline void Block_8_5(void) { Block_xx_5(8) ; } +static inline void Block_8_6(void) { Block_xx_6(8) ; } +static inline void Block_8_7(void) { Block_xx_7(10) ; } +static inline void Block_8_8(void) { Block_xx_8(10) ; } +static inline void Block_8_9(void) { Block_xx_9(10) ; } +static inline void Block_8_10(void){ Block_xx_10(10) ; } +static inline void Block_8_11(void){ Block_xx_11(10) ; } +static inline void Block_8_12(void){ Block_xx_12(10) ; } + +static inline void Block_12_1(void) { Block_xx_1(12) ; } +static inline void Block_12_2(void) { Block_xx_2(12) ; } +static inline void Block_12_3(void) { Block_xx_3(12) ; } +static inline void Block_12_4(void) { Block_xx_4(12) ; } +static inline void Block_12_5(void) { Block_xx_5(12) ; } +static inline void Block_12_6(void) { Block_xx_6(12) ; } +static 
inline void Block_12_7(void) { Block_xx_7(14) ; } +static inline void Block_12_8(void) { Block_xx_8(14) ; } +static inline void Block_12_9(void) { Block_xx_9(14) ; } +static inline void Block_12_10(void){ Block_xx_10(14) ; } +static inline void Block_12_11(void){ Block_xx_11(14) ; } +static inline void Block_12_12(void){ Block_xx_12(14) ; } + +#endif + +#if defined(HAVE_INTEL_AVX2) +static unsigned long mBYTE_FLIP_MASK_Y[] = + { 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f } ; + +#define W_from_buff_Y(buff)\ + { /* X0..3(ymm9..12), W_X[0..15] = sha512->buffer[0.15]; */\ + __asm__ volatile("vmovdqu %0, %%ymm8\n\t"::"m"(mBYTE_FLIP_MASK_Y[0]):"%ymm1") ;\ + __asm__ volatile("vmovdqu %0, %%ymm12\n\t"\ + "vmovdqu %1, %%ymm4\n\t"\ + "vpshufb %%ymm8, %%ymm12, %%ymm12\n\t"\ + "vpshufb %%ymm8, %%ymm4, %%ymm4\n\t"\ + :: "m"(buff[0]), "m"(buff[4]):"%ymm8","%ymm9","%ymm10") ;\ + __asm__ volatile("vmovdqu %0, %%ymm5\n\t"\ + "vmovdqu %1, %%ymm6\n\t"\ + "vpshufb %%ymm8, %%ymm5, %%ymm5\n\t"\ + "vpshufb %%ymm8, %%ymm6, %%ymm6\n\t"\ + :: "m"(buff[8]), "m"(buff[12]):"%ymm8","%ymm9","%ymm10") ;\ + } + +#if defined(DEBUG_YMM) + +#define SAVE_REG_Y(i) __asm__ volatile("vmovdqu %%ymm"#i", %0 \n\t":"=m"(reg[i-4][0])::YMM_REGs); +#define RECV_REG_Y(i) __asm__ volatile("vmovdqu %0, %%ymm"#i" \n\t"::"m"(reg[i-4][0]):YMM_REGs); + +#define _DUMP_REG_Y(REG, name)\ + { word64 buf[16] ;word64 reg[16][2];int k ;\ + SAVE_REG_Y(4); SAVE_REG_Y(5); SAVE_REG_Y(6); SAVE_REG_Y(7); \ + SAVE_REG_Y(8); SAVE_REG_Y(9); SAVE_REG_Y(10); SAVE_REG_Y(11); SAVE_REG_Y(12);\ + SAVE_REG_Y(13); SAVE_REG_Y(14); SAVE_REG_Y(15); \ + __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::YMM_REGs);\ + printf(" "#name":\t") ; for(k=0; k<4; k++) printf("%016lx.", (word64)buf[k]) ; printf("\n") ; \ + RECV_REG_Y(4); RECV_REG_Y(5); RECV_REG_Y(6); RECV_REG_Y(7); \ + RECV_REG_Y(8); RECV_REG_Y(9); RECV_REG_Y(10); RECV_REG_Y(11); RECV_REG_Y(12); \ + RECV_REG_Y(13); RECV_REG_Y(14); RECV_REG_Y(15);\ + } + +#define DUMP_REG_Y(REG) _DUMP_REG_Y(REG, #REG) +#define DUMP_REG2_Y(REG) _DUMP_REG_Y(REG, #REG) +#define PRINTF_Y(fmt, ...) + +#else + +#define DUMP_REG_Y(REG) +#define DUMP_REG2_Y(REG) +#define PRINTF_Y(fmt, ...) 
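/* With DEBUG_YMM left undefined (it appears above only as a commented-out
 * option) the dump helpers expand to nothing, so this debug scaffolding adds
 * no code to normal builds; defining DEBUG_YMM (or DEBUG_XMM on the AVX1
 * side) turns them into save/print/restore sequences for inspecting the
 * YMM/XMM working registers. */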
+ +#endif + +#define _MOVE_to_REGy(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" "\ + :: "m"(mem):YMM_REGs) ; +#define _MOVE_to_MEMy(mem,i, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" \ + : "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3])::YMM_REGs) ; +#define _MOVE_128y(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"\ + #map", %%"#ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ; +#define _S_TEMPy(dest, src, bits, temp) \ + __asm__ volatile("vpsrlq $"#bits", %%"#src", %%"#dest"\n\tvpsllq $64-"#bits\ + ", %%"#src", %%"#temp"\n\tvpor %%"#temp",%%"#dest", %%"#dest" ":::YMM_REGs) ; +#define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrlq $"#bits", %%"\ + #src", %%"#dest" ":::YMM_REGs) ; +#define _XORy(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\ + #src2", %%"#dest" ":::YMM_REGs) ; +#define _ADDy(dest, src1, src2) __asm__ volatile("vpaddq %%"#src1", %%"\ + #src2", %%"#dest" ":::YMM_REGs) ; +#define _BLENDy(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\ + #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ; +#define _BLENDQy(map, dest, src1, src2) __asm__ volatile("vblendpd $"#map", %%"\ + #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ; +#define _PERMQy(map, dest, src) __asm__ volatile("vpermq $"#map", %%"\ + #src", %%"#dest" ":::YMM_REGs) ; + +#define MOVE_to_REGy(ymm, mem) _MOVE_to_REGy(ymm, mem) +#define MOVE_to_MEMy(mem, i, ymm) _MOVE_to_MEMy(mem, i, ymm) + +#define MOVE_128y(ymm0, ymm1, ymm2, map) _MOVE_128y(ymm0, ymm1, ymm2, map) +#define XORy(dest, src1, src2) _XORy(dest, src1, src2) +#define ADDy(dest, src1, src2) _ADDy(dest, src1, src2) +#define BLENDy(map, dest, src1, src2) _BLENDy(map, dest, src1, src2) +#define BLENDQy(map, dest, src1, src2) _BLENDQy(map, dest, src1, src2) +#define PERMQy(map, dest, src) _PERMQy(map, dest, src) + + +#define S_TMPy(dest, src, bits, temp) _S_TEMPy(dest, src, bits, temp); +#define AVX2_S(dest, src, bits) S_TMPy(dest, src, bits, S_TEMPy) +#define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits) + + +#define FEEDBACK1_to_W_I_2(w_i_2, w_i) MOVE_128y(YMM_TEMP0, w_i, w_i, 0x08) ;\ + BLENDy(0xf0, w_i_2, YMM_TEMP0, w_i_2) ; + +#define MOVE_W_to_W_I_15(w_i_15, w_0, w_4) BLENDQy(0x1, w_i_15, w_4, w_0) ;\ + PERMQy(0x39, w_i_15, w_i_15) ; +#define MOVE_W_to_W_I_7(w_i_7, w_8, w_12) BLENDQy(0x1, w_i_7, w_12, w_8) ;\ + PERMQy(0x39, w_i_7, w_i_7) ; +#define MOVE_W_to_W_I_2(w_i_2, w_12) BLENDQy(0xc, w_i_2, w_12, w_i_2) ;\ + PERMQy(0x0e, w_i_2, w_i_2) ; + + +#define W_I_16y ymm8 +#define W_I_15y ymm9 +#define W_I_7y ymm10 +#define W_I_2y ymm11 +#define W_Iy ymm12 +#define G_TEMPy ymm13 +#define S_TEMPy ymm14 +#define YMM_TEMP0 ymm15 +#define YMM_TEMP0x xmm15 +#define W_I_TEMPy ymm7 +#define W_K_TEMPy ymm15 +#define W_K_TEMPx xmm15 +#define W_0y ymm12 +#define W_4y ymm4 +#define W_8y ymm5 +#define W_12y ymm6 + +#define YMM_REGs +/* Registers are saved in Sha512Update/Final */ + /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/ + +#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\ + __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\ + __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs) ;\ + __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\ + __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\ + __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\ + +#define MOVE_7_to_15(w_i_15, w_i_7)\ + __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" 
":::YMM_REGs) ;\ + +#define MOVE_I_to_7(w_i_7, w_i)\ + __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\ + __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\ + __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs) ;\ + +#define MOVE_I_to_2(w_i_2, w_i)\ + __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs) ;\ + __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs) ;\ + +#endif + +/*** Transform Body ***/ +#if defined(HAVE_INTEL_AVX1) + +static int Transform_AVX1(Sha512* sha512) +{ + const word64* K = K512; + + word32 j; + word64 T[8]; + /* Copy digest to working vars */ + XMEMCPY(T, sha512->digest, sizeof(T)); + + W_from_buff(W_X, sha512->buffer) ; + for (j = 0; j < 80; j += 16) { + Rx_1( 0); Block_0_1(); Rx_2( 0); Block_0_2(); Rx_3( 0); Block_0_3(); + Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(); + Rx_1( 2); Block_0_7(); Rx_2( 2); Block_0_8(); Rx_3( 2); Block_0_9(); + Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(); + + Rx_1( 4); Block_4_1(); Rx_2( 4); Block_4_2(); Rx_3( 4); Block_4_3(); + Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(); + Rx_1( 6); Block_4_7(); Rx_2( 6); Block_4_8(); Rx_3( 6); Block_4_9(); + Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(); + + Rx_1( 8); Block_8_1(); Rx_2( 8); Block_8_2(); Rx_3( 8); Block_8_3(); + Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(); + Rx_1(10); Block_8_7(); Rx_2(10); Block_8_8(); Rx_3(10); Block_8_9(); + Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(); + + Rx_1(12); Block_12_1(); Rx_2(12); Block_12_2(); Rx_3(12); Block_12_3(); + Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(); + Rx_1(14); Block_12_7(); Rx_2(14); Block_12_8(); Rx_3(14); Block_12_9(); + Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(); + } + + /* Add the working vars back into digest */ + + sha512->digest[0] += a(0); + sha512->digest[1] += b(0); + sha512->digest[2] += c(0); + sha512->digest[3] += d(0); + sha512->digest[4] += e(0); + sha512->digest[5] += f(0); + sha512->digest[6] += g(0); + sha512->digest[7] += h(0); + + /* Wipe variables */ + #if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2) + XMEMSET(W_X, 0, sizeof(word64) * 16); + #endif + XMEMSET(T, 0, sizeof(T)); + + return 0; +} + +#endif + +#if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX) + +static int Transform_AVX1_RORX(Sha512* sha512) +{ + const word64* K = K512; + + word32 j; + word64 T[8]; + /* Copy digest to working vars */ + XMEMCPY(T, sha512->digest, sizeof(T)); + + W_from_buff(W_X, sha512->buffer) ; + for (j = 0; j < 80; j += 16) { + Rx_RORX_1( 0); Block_0_1(); Rx_RORX_2( 0); Block_0_2(); + Rx_RORX_3( 0); Block_0_3(); + Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1); Block_0_5(); + Rx_RORX_3( 1); Block_0_6(); + Rx_RORX_1( 2); Block_0_7(); Rx_RORX_2( 2); Block_0_8(); + Rx_RORX_3( 2); Block_0_9(); + Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3); Block_0_11(); + Rx_RORX_3( 3); Block_0_12(); + + Rx_RORX_1( 4); Block_4_1(); Rx_RORX_2( 4); Block_4_2(); + Rx_RORX_3( 4); Block_4_3(); + Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5); Block_4_5(); + Rx_RORX_3( 5); Block_4_6(); + Rx_RORX_1( 6); Block_4_7(); Rx_RORX_2( 6); Block_4_8(); + Rx_RORX_3( 6); Block_4_9(); + Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7); Block_4_11(); + Rx_RORX_3( 7); Block_4_12(); + + Rx_RORX_1( 8); Block_8_1(); Rx_RORX_2( 8); 
Block_8_2(); + Rx_RORX_3( 8); Block_8_3(); + Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9); Block_8_5(); + Rx_RORX_3( 9); Block_8_6(); + Rx_RORX_1(10); Block_8_7(); Rx_RORX_2(10); Block_8_8(); + Rx_RORX_3(10); Block_8_9(); + Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11); Block_8_11(); + Rx_RORX_3(11); Block_8_12(); + + Rx_RORX_1(12); Block_12_1(); Rx_RORX_2(12); Block_12_2(); + Rx_RORX_3(12); Block_12_3(); + Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13); Block_12_5(); + Rx_RORX_3(13); Block_12_6(); + Rx_RORX_1(14); Block_12_7(); Rx_RORX_2(14); Block_12_8(); + Rx_RORX_3(14); Block_12_9(); + Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15); Block_12_11(); + Rx_RORX_3(15); Block_12_12(); + } + /* Add the working vars back into digest */ + + sha512->digest[0] += a(0); + sha512->digest[1] += b(0); + sha512->digest[2] += c(0); + sha512->digest[3] += d(0); + sha512->digest[4] += e(0); + sha512->digest[5] += f(0); + sha512->digest[6] += g(0); + sha512->digest[7] += h(0); + + /* Wipe variables */ + #if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2) + XMEMSET(W_X, 0, sizeof(word64) * 16); + #endif + XMEMSET(T, 0, sizeof(T)); + + return 0; +} +#endif + +#if defined(HAVE_INTEL_AVX2) + +#define s0_1y(dest, src) AVX2_S(dest, src, 1); +#define s0_2y(dest, src) AVX2_S(G_TEMPy, src, 8); XORy(dest, G_TEMPy, dest) ; +#define s0_3y(dest, src) AVX2_R(G_TEMPy, src, 7); XORy(dest, G_TEMPy, dest) ; + +#define s1_1y(dest, src) AVX2_S(dest, src, 19); +#define s1_2y(dest, src) AVX2_S(G_TEMPy, src, 61); XORy(dest, G_TEMPy, dest) ; +#define s1_3y(dest, src) AVX2_R(G_TEMPy, src, 6); XORy(dest, G_TEMPy, dest) ; + +#define s0_y(dest, src) s0_1y(dest, src) ; s0_2y(dest, src) ; s0_3y(dest, src) +#define s1_y(dest, src) s1_1y(dest, src) ; s1_2y(dest, src) ; s1_3y(dest, src) + +#define blk384(i) (W[i] = sha384->buffer[i]) + +static word64 w[4] ; + +#define Block_Y_xx_1(i, w_0, w_4, w_8, w_12)\ + MOVE_W_to_W_I_15(W_I_15y, w_0, w_4) ;\ + MOVE_W_to_W_I_7 (W_I_7y, w_8, w_12) ;\ + MOVE_W_to_W_I_2 (W_I_2y, w_12) ;\ + +#define Block_Y_xx_2(i, w_0, w_4, w_8, w_12)\ + s0_1y (YMM_TEMP0, W_I_15y) ;\ + +#define Block_Y_xx_3(i, w_0, w_4, w_8, w_12)\ + s0_2y (YMM_TEMP0, W_I_15y) ;\ + +#define Block_Y_xx_4(i, w_0, w_4, w_8, w_12)\ + s0_3y (YMM_TEMP0, W_I_15y) ;\ + +#define Block_Y_xx_5(i, w_0, w_4, w_8, w_12)\ + ADDy(W_I_TEMPy, w_0, YMM_TEMP0) ;\ + +#define Block_Y_xx_6(i, w_0, w_4, w_8, w_12)\ + ADDy(W_I_TEMPy, W_I_TEMPy, W_I_7y) ;\ + s1_1y (YMM_TEMP0, W_I_2y) ;\ + +#define Block_Y_xx_7(i, w_0, w_4, w_8, w_12)\ + s1_2y (YMM_TEMP0, W_I_2y) ;\ + +#define Block_Y_xx_8(i, w_0, w_4, w_8, w_12)\ + s1_3y (YMM_TEMP0, W_I_2y) ;\ + ADDy(w_0, W_I_TEMPy, YMM_TEMP0) ;\ + +#define Block_Y_xx_9(i, w_0, w_4, w_8, w_12)\ + FEEDBACK1_to_W_I_2(W_I_2y, w_0) ;\ + +#define Block_Y_xx_10(i, w_0, w_4, w_8, w_12) \ + s1_1y (YMM_TEMP0, W_I_2y) ;\ + +#define Block_Y_xx_11(i, w_0, w_4, w_8, w_12) \ + s1_2y (YMM_TEMP0, W_I_2y) ;\ + +#define Block_Y_xx_12(i, w_0, w_4, w_8, w_12)\ + s1_3y (YMM_TEMP0, W_I_2y) ;\ + ADDy(w_0, W_I_TEMPy, YMM_TEMP0) ;\ + MOVE_to_MEMy(w,0, w_4) ;\ + + +static inline void Block_Y_0_1(void) { Block_Y_xx_1(0, W_0y, W_4y, W_8y, W_12y) ; } +static inline void Block_Y_0_2(void) { Block_Y_xx_2(0, W_0y, W_4y, W_8y, W_12y) ; } +static inline void Block_Y_0_3(void) { Block_Y_xx_3(0, W_0y, W_4y, W_8y, W_12y) ; } +static inline void Block_Y_0_4(void) { Block_Y_xx_4(0, W_0y, W_4y, W_8y, W_12y) ; } +static inline void Block_Y_0_5(void) { Block_Y_xx_5(0, W_0y, W_4y, W_8y, W_12y) ; } +static inline void Block_Y_0_6(void) { Block_Y_xx_6(0, W_0y, W_4y, W_8y, W_12y) ; } 
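/* Each family Block_Y_N_1 .. Block_Y_N_12 is one four-word step of the
 * SHA-512 message schedule, W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
 * evaluated for four consecutive i in one YMM register, cut into twelve
 * slices so that Transform_AVX2 can drop one slice between each third
 * (Ry_1/Ry_2/Ry_3) of the four rounds that run alongside it. The register
 * arguments rotate every four rounds (W_0y, W_4y, W_8y, W_12y) so the four
 * YMM registers always hold the sixteen most recent schedule words, and the
 * final slice spills the next four words to the small w[] array that the
 * scalar Ry_* macros read. */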
+static inline void Block_Y_0_7(void) { Block_Y_xx_7(0, W_0y, W_4y, W_8y, W_12y) ; } +static inline void Block_Y_0_8(void) { Block_Y_xx_8(0, W_0y, W_4y, W_8y, W_12y) ; } +static inline void Block_Y_0_9(void) { Block_Y_xx_9(0, W_0y, W_4y, W_8y, W_12y) ; } +static inline void Block_Y_0_10(void){ Block_Y_xx_10(0, W_0y, W_4y, W_8y, W_12y) ; } +static inline void Block_Y_0_11(void){ Block_Y_xx_11(0, W_0y, W_4y, W_8y, W_12y) ; } +static inline void Block_Y_0_12(void){ Block_Y_xx_12(0, W_0y, W_4y, W_8y, W_12y) ; } + +static inline void Block_Y_4_1(void) { Block_Y_xx_1(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_2(void) { Block_Y_xx_2(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_3(void) { Block_Y_xx_3(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_4(void) { Block_Y_xx_4(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_5(void) { Block_Y_xx_5(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_6(void) { Block_Y_xx_6(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_7(void) { Block_Y_xx_7(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_8(void) { Block_Y_xx_8(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_9(void) { Block_Y_xx_9(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_10(void) { Block_Y_xx_10(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_11(void) { Block_Y_xx_11(4, W_4y, W_8y, W_12y, W_0y) ; } +static inline void Block_Y_4_12(void) { Block_Y_xx_12(4, W_4y, W_8y, W_12y, W_0y) ; } + +static inline void Block_Y_8_1(void) { Block_Y_xx_1(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_2(void) { Block_Y_xx_2(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_3(void) { Block_Y_xx_3(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_4(void) { Block_Y_xx_4(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_5(void) { Block_Y_xx_5(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_6(void) { Block_Y_xx_6(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_7(void) { Block_Y_xx_7(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_8(void) { Block_Y_xx_8(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_9(void) { Block_Y_xx_9(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_10(void) { Block_Y_xx_10(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_11(void) { Block_Y_xx_11(8, W_8y, W_12y, W_0y, W_4y) ; } +static inline void Block_Y_8_12(void) { Block_Y_xx_12(8, W_8y, W_12y, W_0y, W_4y) ; } + +static inline void Block_Y_12_1(void) { Block_Y_xx_1(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void Block_Y_12_2(void) { Block_Y_xx_2(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void Block_Y_12_3(void) { Block_Y_xx_3(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void Block_Y_12_4(void) { Block_Y_xx_4(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void Block_Y_12_5(void) { Block_Y_xx_5(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void Block_Y_12_6(void) { Block_Y_xx_6(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void Block_Y_12_7(void) { Block_Y_xx_7(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void Block_Y_12_8(void) { Block_Y_xx_8(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void Block_Y_12_9(void) { Block_Y_xx_9(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void Block_Y_12_10(void) { Block_Y_xx_10(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void Block_Y_12_11(void) { Block_Y_xx_11(12, W_12y, W_0y, W_4y, W_8y) ; } +static inline void 
Block_Y_12_12(void) { Block_Y_xx_12(12, W_12y, W_0y, W_4y, W_8y) ; } + + +static int Transform_AVX2(Sha512* sha512) +{ + const word64* K = K512; + + word32 j /*, k*/; + word64 T[8]; + /* Copy digest to working vars */ + XMEMCPY(T, sha512->digest, sizeof(T)); + + W_from_buff_Y(sha512->buffer) ; + MOVE_to_MEMy(w,0, W_0y) ; + for (j = 0; j < 80; j += 16) { + Ry_1( 0, w[0]); Block_Y_0_1(); Ry_2( 0, w[0]); Block_Y_0_2(); + Ry_3( 0, w[0]); Block_Y_0_3(); + Ry_1( 1, w[1]); Block_Y_0_4(); Ry_2( 1, w[1]); Block_Y_0_5(); + Ry_3( 1, w[1]); Block_Y_0_6(); + Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]); Block_Y_0_8(); + Ry_3( 2, w[2]); Block_Y_0_9(); + Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]); Block_Y_0_11(); + Ry_3( 3, w[3]); Block_Y_0_12(); + + Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]); Block_Y_4_2(); + Ry_3( 4, w[0]); Block_Y_4_3(); + Ry_1( 5, w[1]); Block_Y_4_4(); Ry_2( 5, w[1]); Block_Y_4_5(); + Ry_3( 5, w[1]); Block_Y_4_6(); + Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]); Block_Y_4_8(); + Ry_3( 6, w[2]); Block_Y_4_9(); + Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);Block_Y_4_11(); + Ry_3( 7, w[3]);Block_Y_4_12(); + + Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]); Block_Y_8_2(); + Ry_3( 8, w[0]); Block_Y_8_3(); + Ry_1( 9, w[1]); Block_Y_8_4(); Ry_2( 9, w[1]); Block_Y_8_5(); + Ry_3( 9, w[1]); Block_Y_8_6(); + Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]); Block_Y_8_8(); + Ry_3(10, w[2]); Block_Y_8_9(); + Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]); Block_Y_8_11(); + Ry_3(11, w[3]); Block_Y_8_12(); + + Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]); Block_Y_12_2(); + Ry_3(12, w[0]); Block_Y_12_3(); + Ry_1(13, w[1]); Block_Y_12_4(); Ry_2(13, w[1]); Block_Y_12_5(); + Ry_3(13, w[1]); Block_Y_12_6(); + Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]); Block_Y_12_8(); + Ry_3(14, w[2]); Block_Y_12_9(); + Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]); Block_Y_12_11(); + Ry_3(15, w[3]);Block_Y_12_12(); + } + + /* Add the working vars back into digest */ + + sha512->digest[0] += a(0); + sha512->digest[1] += b(0); + sha512->digest[2] += c(0); + sha512->digest[3] += d(0); + sha512->digest[4] += e(0); + sha512->digest[5] += f(0); + sha512->digest[6] += g(0); + sha512->digest[7] += h(0); + + /* Wipe variables */ + #if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2) + XMEMSET(W, 0, sizeof(word64) * 16); + #endif + XMEMSET(T, 0, sizeof(T)); + + return 0; +} + +#endif + #ifdef WOLFSSL_SHA384 +#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + +#if defined(HAVE_INTEL_AVX1) +static int Transform384_AVX1(Sha384 *sha384) ; +#endif +#if defined(HAVE_INTEL_AVX2) +static int Transform384_AVX2(Sha384 *sha384) ; +#endif + +#if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) &&defined(HAVE_INTEL_RORX) +static int Transform384_AVX1_RORX(Sha384 *sha384) ; +#endif + +static int _Transform384(Sha384 *sha384) ; +static int (*Transform384_p)(Sha384* sha384) = _Transform384 ; + +#define Transform384(sha384) (*Transform384_p)(sha384) +static void set_Transform384(void) { + set_cpuid_flags() ; + +#if defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) + Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ; +#elif defined(HAVE_INTEL_AVX2) + #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX) + if(IS_INTEL_AVX2) { Transform384_p = Transform384_AVX1_RORX ; return ; } + #endif + if(IS_INTEL_AVX2) { Transform384_p = Transform384_AVX2 ; return ; } + #if defined(HAVE_INTEL_AVX1) + Transform384_p = ((IS_INTEL_AVX1) ? 
Transform384_AVX1 : _Transform384) ; + #endif +#else + Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ; +#endif +} + +#else + #define Transform384(sha512) _Transform384(sha512) +#endif + int wc_InitSha384(Sha384* sha384) { sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8); @@ -393,11 +1375,14 @@ int wc_InitSha384(Sha384* sha384) sha384->loLen = 0; sha384->hiLen = 0; +#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) + set_Transform384() ; +#endif + return 0; } - -static int Transform384(Sha384* sha384) +static int _Transform384(Sha384* sha384) { const word64* K = K512; @@ -448,8 +1433,8 @@ static int Transform384(Sha384* sha384) sha384->digest[7] += h(0); /* Wipe variables */ - ForceZero(W, sizeof(word64) * 16); - ForceZero(T, sizeof(T)); + XMEMSET(W, 0, sizeof(word64) * 16); + XMEMSET(T, 0, sizeof(T)); #ifdef WOLFSSL_SMALL_STACK XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); @@ -458,7 +1443,6 @@ static int Transform384(Sha384* sha384) return 0; } - static INLINE void AddLength384(Sha384* sha384, word32 len) { word32 tmp = sha384->loLen; @@ -466,12 +1450,13 @@ static INLINE void AddLength384(Sha384* sha384, word32 len) sha384->hiLen++; /* carry low to high */ } - int wc_Sha384Update(Sha384* sha384, const byte* data, word32 len) { /* do block size increments */ byte* local = (byte*)sha384->buffer; - + + SAVE_XMM_YMM ; /* for Intel AVX */ + while (len) { word32 add = min(len, SHA384_BLOCK_SIZE - sha384->buffLen); XMEMCPY(&local[sha384->buffLen], data, add); @@ -483,8 +1468,11 @@ int wc_Sha384Update(Sha384* sha384, const byte* data, word32 len) if (sha384->buffLen == SHA384_BLOCK_SIZE) { int ret; - #ifdef LITTLE_ENDIAN_ORDER - ByteReverseWords64(sha384->buffer, sha384->buffer, + #if defined(LITTLE_ENDIAN_ORDER) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif + ByteReverseWords64(sha384->buffer, sha384->buffer, SHA384_BLOCK_SIZE); #endif ret = Transform384(sha384); @@ -504,6 +1492,7 @@ int wc_Sha384Final(Sha384* sha384, byte* hash) byte* local = (byte*)sha384->buffer; int ret; + SAVE_XMM_YMM ; /* for Intel AVX */ AddLength384(sha384, sha384->buffLen); /* before adding pads */ local[sha384->buffLen++] = 0x80; /* add 1 */ @@ -513,8 +1502,12 @@ int wc_Sha384Final(Sha384* sha384, byte* hash) XMEMSET(&local[sha384->buffLen], 0, SHA384_BLOCK_SIZE -sha384->buffLen); sha384->buffLen += SHA384_BLOCK_SIZE - sha384->buffLen; - #ifdef LITTLE_ENDIAN_ORDER - ByteReverseWords64(sha384->buffer,sha384->buffer,SHA384_BLOCK_SIZE); + #if defined(LITTLE_ENDIAN_ORDER) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif + ByteReverseWords64(sha384->buffer, sha384->buffer, + SHA384_BLOCK_SIZE); #endif ret = Transform384(sha384); if (ret != 0) @@ -530,13 +1523,22 @@ int wc_Sha384Final(Sha384* sha384, byte* hash) sha384->loLen = sha384->loLen << 3; /* store lengths */ - #ifdef LITTLE_ENDIAN_ORDER - ByteReverseWords64(sha384->buffer, sha384->buffer, SHA384_PAD_SIZE); + #if defined(LITTLE_ENDIAN_ORDER) + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) + #endif + ByteReverseWords64(sha384->buffer, sha384->buffer, + SHA384_BLOCK_SIZE); #endif /* ! length ordering dependent on digest endian type ! 
*/ sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 2] = sha384->hiLen; sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 1] = sha384->loLen; - + #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) + if(IS_INTEL_AVX1 || IS_INTEL_AVX2) + ByteReverseWords64(&(sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 2]), + &(sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 2]), + SHA384_BLOCK_SIZE - SHA384_PAD_SIZE); + #endif ret = Transform384(sha384); if (ret != 0) return ret; @@ -582,6 +1584,208 @@ int wc_Sha384Hash(const byte* data, word32 len, byte* hash) return ret; } +#if defined(HAVE_INTEL_AVX1) + +static int Transform384_AVX1(Sha384* sha384) +{ + const word64* K = K512; + + word32 j; + word64 T[8]; + + /* Copy digest to working vars */ + XMEMCPY(T, sha384->digest, sizeof(T)); + W_from_buff(W_X, sha384->buffer) ; + for (j = 0; j < 80; j += 16) { + Rx_1( 0); Block_0_1(); Rx_2( 0); Block_0_2(); Rx_3( 0); Block_0_3(); + Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(); + Rx_1( 2); Block_0_7(); Rx_2( 2); Block_0_8(); Rx_3( 2); Block_0_9(); + Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(); + + Rx_1( 4); Block_4_1(); Rx_2( 4); Block_4_2(); Rx_3( 4); Block_4_3(); + Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(); + Rx_1( 6); Block_4_7(); Rx_2( 6); Block_4_8(); Rx_3( 6); Block_4_9(); + Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(); + + Rx_1( 8); Block_8_1(); Rx_2( 8); Block_8_2(); Rx_3( 8); Block_8_3(); + Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(); + Rx_1(10); Block_8_7(); Rx_2(10); Block_8_8(); Rx_3(10); Block_8_9(); + Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(); + + Rx_1(12); Block_12_1(); Rx_2(12); Block_12_2(); Rx_3(12); Block_12_3(); + Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(); + Rx_1(14); Block_12_7(); Rx_2(14); Block_12_8(); Rx_3(14); Block_12_9(); + Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(); + } + + /* Add the working vars back into digest */ + + sha384->digest[0] += a(0); + sha384->digest[1] += b(0); + sha384->digest[2] += c(0); + sha384->digest[3] += d(0); + sha384->digest[4] += e(0); + sha384->digest[5] += f(0); + sha384->digest[6] += g(0); + sha384->digest[7] += h(0); + + /* Wipe variables */ + #if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2) + XMEMSET(W, 0, sizeof(word64) * 16); + #endif + XMEMSET(T, 0, sizeof(T)); + + return 0; +} + +#endif + +#if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX) +static int Transform384_AVX1_RORX(Sha384* sha384) +{ + const word64* K = K512; + + word32 j; + word64 T[8]; + + /* Copy digest to working vars */ + XMEMCPY(T, sha384->digest, sizeof(T)); + + W_from_buff(W_X, sha384->buffer) ; + for (j = 0; j < 80; j += 16) { + Rx_RORX_1( 0); Block_0_1(); Rx_RORX_2( 0); + Block_0_2(); Rx_RORX_3( 0); Block_0_3(); + Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1); + Block_0_5(); Rx_RORX_3( 1); Block_0_6(); + Rx_RORX_1( 2); Block_0_7(); Rx_RORX_2( 2); + Block_0_8(); Rx_RORX_3( 2); Block_0_9(); + Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3); + Block_0_11();Rx_RORX_3( 3); Block_0_12(); + + Rx_RORX_1( 4); Block_4_1(); Rx_RORX_2( 4); + Block_4_2(); Rx_RORX_3( 4); Block_4_3(); + Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5); + Block_4_5(); Rx_RORX_3( 5); Block_4_6(); + Rx_RORX_1( 6); Block_4_7(); Rx_RORX_2( 6); + Block_4_8(); Rx_RORX_3( 6); Block_4_9(); + Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7); + Block_4_11();Rx_RORX_3( 7); 
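/* As with the Sha512 versions, this _RORX variant differs from
 * Transform384_AVX1 only in using the rorx-based S0_RORX/S1_RORX rotates
 * (single instructions that leave the flags untouched) in the scalar round
 * thirds; the XMM schedule fragments are shared. More generally, SHA-384
 * reuses SHA-512's eighty-round compression function and K512 table,
 * differing only in its initial hash values and in truncating the final
 * digest, which is why these Transform384_* bodies mirror their Sha512
 * counterparts. */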
+
+        Rx_RORX_1( 8); Block_8_1(); Rx_RORX_2( 8);
+                       Block_8_2(); Rx_RORX_3( 8); Block_8_3();
+        Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9);
+                       Block_8_5(); Rx_RORX_3( 9); Block_8_6();
+        Rx_RORX_1(10); Block_8_7(); Rx_RORX_2(10);
+                       Block_8_8(); Rx_RORX_3(10); Block_8_9();
+        Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11);
+                       Block_8_11();Rx_RORX_3(11); Block_8_12();
+
+        Rx_RORX_1(12); Block_12_1(); Rx_RORX_2(12);
+                       Block_12_2(); Rx_RORX_3(12); Block_12_3();
+        Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13);
+                       Block_12_5(); Rx_RORX_3(13); Block_12_6();
+        Rx_RORX_1(14); Block_12_7(); Rx_RORX_2(14);
+                       Block_12_8(); Rx_RORX_3(14); Block_12_9();
+        Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15);
+                       Block_12_11();Rx_RORX_3(15); Block_12_12();
+    }
+
+    /* Add the working vars back into digest */
+
+    sha384->digest[0] += a(0);
+    sha384->digest[1] += b(0);
+    sha384->digest[2] += c(0);
+    sha384->digest[3] += d(0);
+    sha384->digest[4] += e(0);
+    sha384->digest[5] += f(0);
+    sha384->digest[6] += g(0);
+    sha384->digest[7] += h(0);
+
+    /* Wipe variables */
+    #if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2)
+    XMEMSET(W, 0, sizeof(word64) * 16);
+    #endif
+    XMEMSET(T, 0, sizeof(T));
+
+    return 0;
+}
+#endif
+
+#if defined(HAVE_INTEL_AVX2)
+
+static int Transform384_AVX2(Sha384* sha384)
+{
+    const word64* K = K512;
+
+    word32 j;
+    word64 T[8];
+
+    /* Copy digest to working vars */
+    XMEMCPY(T, sha384->digest, sizeof(T));
+
+    /* over twice as small, but 50% slower */
+    /* 80 operations, not unrolled */
+
+    W_from_buff_Y(sha384->buffer) ;
+
+    MOVE_to_MEMy(w,0, W_0y) ;
+    for (j = 0; j < 80; j += 16) {
+        Ry_1( 0, w[0]); Block_Y_0_1(); Ry_2( 0, w[0]);
+                        Block_Y_0_2(); Ry_3( 0, w[0]); Block_Y_0_3();
+        Ry_1( 1, w[1]); Block_Y_0_4(); Ry_2( 1, w[1]);
+                        Block_Y_0_5(); Ry_3( 1, w[1]); Block_Y_0_6();
+        Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]);
+                        Block_Y_0_8(); Ry_3( 2, w[2]); Block_Y_0_9();
+        Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]);
+                        Block_Y_0_11();Ry_3( 3, w[3]); Block_Y_0_12();
+
+        Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]);
+                        Block_Y_4_2(); Ry_3( 4, w[0]); Block_Y_4_3();
+        Ry_1( 5, w[1]); Block_Y_4_4(); Ry_2( 5, w[1]);
+                        Block_Y_4_5(); Ry_3( 5, w[1]); Block_Y_4_6();
+        Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]);
+                        Block_Y_4_8(); Ry_3( 6, w[2]); Block_Y_4_9();
+        Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);
+                        Block_Y_4_11(); Ry_3( 7, w[3]);Block_Y_4_12();
+
+        Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]);
+                        Block_Y_8_2(); Ry_3( 8, w[0]); Block_Y_8_3();
+        Ry_1( 9, w[1]); Block_Y_8_4(); Ry_2( 9, w[1]);
+                        Block_Y_8_5(); Ry_3( 9, w[1]); Block_Y_8_6();
+        Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]);
+                        Block_Y_8_8(); Ry_3(10, w[2]); Block_Y_8_9();
+        Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]);
+                        Block_Y_8_11();Ry_3(11, w[3]); Block_Y_8_12();
+
+        Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]);
+                        Block_Y_12_2(); Ry_3(12, w[0]); Block_Y_12_3();
+        Ry_1(13, w[1]); Block_Y_12_4(); Ry_2(13, w[1]);
+                        Block_Y_12_5(); Ry_3(13, w[1]); Block_Y_12_6();
+        Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]);
+                        Block_Y_12_8(); Ry_3(14, w[2]); Block_Y_12_9();
+        Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]);
+                        Block_Y_12_11();Ry_3(15, w[3]); Block_Y_12_12();
+    }
+
+    /* Add the working vars back into digest */
+
+    sha384->digest[0] += a(0);
+    sha384->digest[1] += b(0);
+    sha384->digest[2] += c(0);
+    sha384->digest[3] += d(0);
+    sha384->digest[4] += e(0);
+    sha384->digest[5] += f(0);
+    sha384->digest[6] += g(0);
+    sha384->digest[7] += h(0);
+
+    /* Wipe variables */
+    XMEMSET(T, 0, sizeof(T));
+
+    return 0;
+}
+
+#endif
+
 #endif /* WOLFSSL_SHA384 */
 
 #endif /* HAVE_FIPS */
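
For local verification of this patch, a small harness like the one below (not part of the patch; the header paths and the USE_INTEL_SPEEDUP build option are assumptions) drives the public wc_Sha384* API so that whichever Transform384 variant set_Transform384() installs at wc_InitSha384() time (plain C, AVX1, or AVX1+RORX) is exercised against the FIPS 180-4 "abc" test vector. The digest must be identical for every variant; only the transform implementation differs.

/* sha384_avx_check.c -- hypothetical standalone check, not shipped with wolfSSL */
#include <stdio.h>
#include <string.h>

#include <wolfssl/wolfcrypt/settings.h>  /* assumed include layout */
#include <wolfssl/wolfcrypt/sha512.h>    /* Sha384, SHA384_DIGEST_SIZE, wc_Sha384* */

int main(void)
{
    /* SHA-384("abc") from FIPS 180-4 */
    static const byte expected[SHA384_DIGEST_SIZE] = {
        0xcb,0x00,0x75,0x3f,0x45,0xa3,0x5e,0x8b,0xb5,0xa0,0x3d,0x69,
        0x9a,0xc6,0x50,0x07,0x27,0x2c,0x32,0xab,0x0e,0xde,0xd1,0x63,
        0x1a,0x8b,0x60,0x5a,0x43,0xff,0x5b,0xed,0x80,0x86,0x07,0x2b,
        0xa1,0xe7,0xcc,0x23,0x58,0xba,0xec,0xa1,0x34,0xc8,0x25,0xa7
    };
    byte   digest[SHA384_DIGEST_SIZE];
    Sha384 sha;

    /* wc_InitSha384() runs set_Transform384(), which probes CPUID and
     * points Transform384_p at the fastest available implementation. */
    if (wc_InitSha384(&sha) != 0)
        return 1;
    if (wc_Sha384Update(&sha, (const byte*)"abc", 3) != 0)
        return 1;
    if (wc_Sha384Final(&sha, digest) != 0)
        return 1;

    if (memcmp(digest, expected, sizeof(expected)) != 0) {
        printf("SHA-384 AVX dispatch: MISMATCH\n");
        return 1;
    }
    printf("SHA-384 AVX dispatch: OK\n");
    return 0;
}

Building once with and once without the AVX speedup (e.g. configuring wolfSSL with and without USE_INTEL_SPEEDUP defined, names assumed from this patch) and comparing the output is a quick way to confirm that the function-pointer dispatch and the conditional ByteReverseWords64 handling in wc_Sha384Update/wc_Sha384Final agree on all paths.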